[Pacemaker] Fail-count and failure timeout
Holger.Teutsch at fresenius-netcare.com
Holger.Teutsch at fresenius-netcare.com
Fri Oct 1 13:40:26 UTC 2010
Hi,
I observed the following in Pacemaker version 1.1.3 and in tip up to patch
10258.
In a small test environment to study fail-count behavior I have one
resource of type "anything" running "sleep 600", with a monitor interval of
10 seconds.
The failure-timeout is 300.
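Since the process exits every 600 seconds and the failure-timeout is 300,
each failure should have expired before the next one occurs, so I would
expect to never see a fail-count higher than 1.
I observed some sporadic clears, but mostly the count increases by 1
every 10 minutes.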
Am I mistaken, or is this a bug?
Regards
Holger
--- complete CIB for reference ---
<!-- CIB snapshot as posted: one node (hotlx) and one test resource (test). -->
<cib epoch="32"
     num_updates="0"
     admin_epoch="0"
     validate-with="pacemaker-1.2"
     crm_feature_set="3.0.4"
     have-quorum="0"
     cib-last-written="Fri Oct 1 14:17:31 2010"
     dc-uuid="hotlx">
  <configuration>

    <!-- Cluster-wide options. -->
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <nvpair id="cib-bootstrap-options-dc-version"
                name="dc-version"
                value="1.1.3-09640bd6069e677d5eed65203a6056d9bf562e67"/>
        <nvpair id="cib-bootstrap-options-cluster-infrastructure"
                name="cluster-infrastructure"
                value="openais"/>
        <nvpair id="cib-bootstrap-options-expected-quorum-votes"
                name="expected-quorum-votes"
                value="2"/>
        <nvpair id="cib-bootstrap-options-no-quorum-policy"
                name="no-quorum-policy"
                value="ignore"/>
        <nvpair id="cib-bootstrap-options-stonith-enabled"
                name="stonith-enabled"
                value="false"/>
        <nvpair id="cib-bootstrap-options-start-failure-is-fatal"
                name="start-failure-is-fatal"
                value="false"/>
        <nvpair id="cib-bootstrap-options-last-lrm-refresh"
                name="last-lrm-refresh"
                value="1285926879"/>
      </cluster_property_set>
    </crm_config>

    <!-- The only node listed in this CIB. -->
    <nodes>
      <node id="hotlx" uname="hotlx" type="normal"/>
    </nodes>

    <resources>
      <!-- Resource under test: ocf:heartbeat:anything with binfile "sleep 600". -->
      <primitive class="ocf"
                 id="test"
                 provider="heartbeat"
                 type="anything">

        <!-- failure-timeout is set to 300 for this resource. -->
        <meta_attributes id="test-meta_attributes">
          <nvpair id="test-meta_attributes-target-role"
                  name="target-role"
                  value="started"/>
          <nvpair id="test-meta_attributes-failure-timeout"
                  name="failure-timeout"
                  value="300"/>
        </meta_attributes>

        <!-- Recurring monitor (interval 10) and start; both use on-fail="restart" and a 20s timeout. -->
        <operations id="test-operations">
          <op id="test-op-monitor-10"
              interval="10"
              name="monitor"
              on-fail="restart"
              timeout="20s"/>
          <op id="test-op-start-0"
              interval="0"
              name="start"
              on-fail="restart"
              timeout="20s"/>
        </operations>

        <!-- Value passed to the agent's binfile parameter. -->
        <instance_attributes id="test-instance_attributes">
          <nvpair id="test-instance_attributes-binfile"
                  name="binfile"
                  value="sleep 600"/>
        </instance_attributes>
      </primitive>
    </resources>

    <!-- No constraints are defined. -->
    <constraints/>

  </configuration>
</cib>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.clusterlabs.org/pipermail/pacemaker/attachments/20101001/7b15fd92/attachment-0002.html>
More information about the Pacemaker
mailing list