[Pacemaker] Cluster issues - all resources restarting when a node reboots
Velayutham, Prakash
Prakash.Velayutham at cchmc.org
Sat Jun 30 14:04:55 UTC 2012
On Jun 28, 2012, at 1:00 AM, Andrew Beekhof wrote:
> On Thu, Jun 28, 2012 at 4:03 AM, Velayutham, Prakash
> <Prakash.Velayutham at cchmc.org> wrote:
>> Hello all,
>>
>> Below is the relevant part of my CIB. I am facing 2 issues.
>>
>> 1. Every time a node in the cluster reboots, all resources get restarted in the entire cluster.
>> 2. I have a time-based rule for stickiness, but it does not work.
>>
>> Can anyone point out what is wrong with the configuration?
>
> Too many dots?
> Impossible to say really, perhaps file a bug and attach a crm_report
> (run with --help) covering the time of your test case.
>
>>
>> <crm_config>
>> <cluster_property_set id="cib-bootstrap-options">
>> <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.5-5ce2879aa0d5f43d01629bc20edc6868a9352002"/>
>> <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
>> <nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
>> <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1340724782"/>
>> <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
>> <nvpair id="cib-bootstrap-options-cluster-recheck-interval" name="cluster-recheck-interval" value="1min"/>
>> </cluster_property_set>
>> </crm_config>
>> <nodes>
>> <node id="bmimysqlp3" uname="bmimysqlp3" type="normal">
>> ...
>> </node>
>> <node id="bmimysqlp4" uname="bmimysqlp4" type="normal">
>> ...
>> </node>
>> </nodes>
>> <resources>
>> <primitive class="stonith" id="stonith-1" type="external/riloe">
>> ...
>> </primitive>
>> <primitive class="stonith" id="stonith-2" type="external/riloe">
>> ...
>> </primitive>
>> <group id="g_mysql-rschp01">
>> <primitive class="ocf" id="p_vip-rschp01" provider="heartbeat" type="IPaddr2">
>> <operations id="p_vip-rschp01-operations">
>> ...
>> </operations>
>> <instance_attributes id="p_vip-rschp01-instance_attributes">
>> ...
>> </instance_attributes>
>> <meta_attributes id="p_vip-rschp01-meta_attributes"> </meta_attributes>
>> </primitive>
>> <primitive class="ocf" id="p_mysql-rschp01" provider="heartbeat" type="mysql">
>> <operations id="p_mysql-rschp01-operations">
>> …
>> </operations>
>> <instance_attributes id="p_mysql-rschp01-instance_attributes">
>> ...
>> </instance_attributes>
>> <meta_attributes id="p_mysql-rschp01-meta_attributes">
>> …
>> </meta_attributes>
>> </primitive>
>> </group>
>> <group id="g_mysql-rschp02">
>> <primitive class="ocf" id="p_vip-rschp02" provider="heartbeat" type="IPaddr2">
>> <operations id="p_vip-rschp02-operations">
>> ...
>> </operations>
>> <instance_attributes id="p_vip-rschp02-instance_attributes">
>> ...
>> </instance_attributes>
>> <meta_attributes id="p_vip-rschp02-meta_attributes"> </meta_attributes>
>> </primitive>
>> <primitive class="ocf" id="p_mysql-rschp02" provider="heartbeat" type="mysql">
>> <operations id="p_mysql-rschp02-operations">
>> …
>> </operations>
>> <instance_attributes id="p_mysql-rschp02-instance_attributes">
>> ...
>> </instance_attributes>
>> <meta_attributes id="p_mysql-rschp02-meta_attributes"> </meta_attributes>
>> </primitive>
>> </group>
>> <clone id="c_ping">
>> <meta_attributes id="c_ping-meta_attributes">
>> <nvpair id="c_ping-meta_attributes-target-role" name="target-role" value="Started"/>
>> </meta_attributes>
>> <primitive class="ocf" id="p_ping" provider="pacemaker" type="ping">
>> <operations id="p_ping-operations">
>> <op id="p_ping-op-monitor-15" interval="15" name="monitor" timeout="10"/>
>> </operations>
>> <instance_attributes id="p_ping-instance_attributes">
>> <nvpair id="p_ping-instance_attributes-host_list" name="host_list" value="10.200.31.1"/>
>> <nvpair id="p_ping-instance_attributes-dampen" name="dampen" value="5s"/>
>> <nvpair id="p_ping-instance_attributes-multiplier" name="multiplier" value="100"/>
>> <nvpair id="p_ping-instance_attributes-attempts" name="attempts" value="2"/>
>> <nvpair id="p_ping-instance_attributes-name" name="name" value="pingd"/>
>> </instance_attributes>
>> <meta_attributes id="p_ping-meta_attributes">
>> <nvpair id="p_ping-meta_attributes-target-role" name="target-role" value="Started"/>
>> </meta_attributes>
>> </primitive>
>> </clone>
>> <clone id="c_stack">
>> <group id="g_stack">
>> <primitive class="ocf" id="p_dlmcontrold" provider="pacemaker" type="controld">
>> <operations id="p_dlmcontrold-operations">
>> …
>> </operations>
>> <instance_attributes id="p_dlmcontrold-instance_attributes">
>> <nvpair id="p_dlmcontrold-instance_attributes-daemon" name="daemon" value="dlm_controld.pcmk"/>
>> </instance_attributes>
>> </primitive>
>> <primitive class="ocf" id="p_ocfs2controld" provider="ocfs2" type="o2cb">
>> <operations id="p_ocfs2controld-operations">
>> …
>> </operations>
>> <instance_attributes id="p_ocfs2controld-instance_attributes">
>> <nvpair id="p_ocfs2controld-instance_attributes-stack" name="stack" value="pcmk"/>
>> </instance_attributes>
>> </primitive>
>> </group>
>> </clone>
>> <clone id="c_ocfs2-u02">
>> <primitive class="ocf" id="p_ocfs2-u02" provider="heartbeat" type="Filesystem">
>> <operations id="p_ocfs2-u02-operations">
>> <op id="p_ocfs2-u02-op-monitor-60" interval="60" name="monitor" timeout="45">
>> <instance_attributes id="p_ocfs2-u02-op-monitor-60-instance_attributes">
>> <nvpair id="p_ocfs2-u02-op-monitor-60-instance_attributes-OCF_CHECK_LEVEL" name="OCF_CHECK_LEVEL" value="20"/>
>> </instance_attributes>
>> </op>
>> <op id="p_ocfs2-u02-op-start-0" interval="0" name="start" timeout="60"/>
>> </operations>
>> <instance_attributes id="p_ocfs2-u02-instance_attributes">
>> <nvpair id="p_ocfs2-u02-instance_attributes-device" name="device" value="/dev/mapper/bmimysqlp3_p4_vol1"/>
>> <nvpair id="p_ocfs2-u02-instance_attributes-directory" name="directory" value="/u02"/>
>> <nvpair id="p_ocfs2-u02-instance_attributes-fstype" name="fstype" value="ocfs2"/>
>> <nvpair id="p_ocfs2-u02-instance_attributes-options" name="options" value="rw,nointr,data=writeback"/>
>> </instance_attributes>
>> <meta_attributes id="p_ocfs2-u02-meta_attributes"> </meta_attributes>
>> </primitive>
>> </clone>
>> <clone id="c_ocfs2-u03">
>> <meta_attributes id="c_ocfs2-u03-meta_attributes">
>> <nvpair id="c_ocfs2-u03-meta_attributes-interleave" name="interleave" value="true"/>
>> <nvpair id="c_ocfs2-u03-meta_attributes-target-role" name="target-role" value="Started"/>
>> </meta_attributes>
>> <primitive class="ocf" id="p_ocfs2-u03" provider="heartbeat" type="Filesystem">
>> <operations id="p_ocfs2-u03-operations">
>> <op id="p_ocfs2-u03-op-monitor-60" interval="60" name="monitor" timeout="45">
>> <instance_attributes id="p_ocfs2-u03-op-monitor-60-instance_attributes">
>> <nvpair id="p_ocfs2-u03-op-monitor-60-instance_attributes-OCF_CHECK_LEVEL" name="OCF_CHECK_LEVEL" value="20"/>
>> </instance_attributes>
>> </op>
>> <op id="p_ocfs2-u03-op-start-0" interval="0" name="start" timeout="60"/>
>> </operations>
>> <instance_attributes id="p_ocfs2-u03-instance_attributes">
>> <nvpair id="p_ocfs2-u03-instance_attributes-device" name="device" value="/dev/mapper/bmimysqlp3_p4_vol2"/>
>> <nvpair id="p_ocfs2-u03-instance_attributes-directory" name="directory" value="/u03"/>
>> <nvpair id="p_ocfs2-u03-instance_attributes-fstype" name="fstype" value="ocfs2"/>
>> <nvpair id="p_ocfs2-u03-instance_attributes-options" name="options" value="rw,nointr,data=writeback"/>
>> </instance_attributes>
>> <meta_attributes id="p_ocfs2-u03-meta_attributes">
>> <nvpair id="p_ocfs2-u03-meta_attributes-target-role" name="target-role" value="Started"/>
>> </meta_attributes>
>> </primitive>
>> </clone>
>> </resources>
>> <constraints>
>> <rsc_order first="c_stack" id="c_ocfs2-u02-after-c_stack" then="c_ocfs2-u02"/>
>> <rsc_order first="c_stack" id="c_ocfs2-u03-after-c_stack" then="c_ocfs2-u03"/>
>> <rsc_order first="c_ocfs2-u02" id="g_mysql-rschp01-after-c_ocfs2-u02" then="g_mysql-rschp01"/>
>> <rsc_order first="c_ocfs2-u03" id="g_mysql-rschp02-after-c_ocfs2-u03" then="g_mysql-rschp02"/>
>> <rsc_location id="stonith-1-never-on-bmimysqlp3" node="bmimysqlp3" rsc="stonith-1" score="-INFINITY"/>
>> <rsc_location id="stonith-2-never-on-bmimysqlp4" node="bmimysqlp4" rsc="stonith-2" score="-INFINITY"/>
>> <rsc_location id="g_mysql-rschp01-prefers-bmimysqlp3" node="bmimysqlp3" rsc="g_mysql-rschp01" score="50"/>
>> <rsc_location id="g_mysql-rschp02-prefers-bmimysqlp4" node="bmimysqlp4" rsc="g_mysql-rschp02" score="50"/>
>> <rsc_colocation id="g_mysql-rschp01-with-c_ocfs2-u02" rsc="g_mysql-rschp01" score="+INFINITY" with-rsc="c_ocfs2-u02"/>
>> <rsc_colocation id="g_mysql-rschp02-with-c_ocfs2-u03" rsc="g_mysql-rschp02" score="+INFINITY" with-rsc="c_ocfs2-u03"/>
>> <rsc_location id="g_mysql-rschp01-no-connectivity" rsc="g_mysql-rschp01">
>> <rule id="ping-exclude-rule-rschp01" score="-INFINITY">
>> <expression attribute="pingd" id="ping-exclude-rule-rschp01-pingd" operation="lte" value="0"/>
>> </rule>
>> </rsc_location>
>> <rsc_location id="g_mysql-rschp02-no-connectivity" rsc="g_mysql-rschp02">
>> <rule id="ping-exclude-rule-2" score="-INFINITY">
>> <expression attribute="pingd" id="ping-exclude-rule-2-pingd" operation="lte" value="0"/>
>> </rule>
>> </rsc_location>
>> </constraints>
>> <rsc_defaults>
>> <meta_attributes id="core-hours" score="2">
>> <rule id="core-hour-rule" score="0">
>> <date_expression id="seven-to-fourteen-mon-to-fri" operation="date_spec">
>> <date_spec hours="7-14" id="seven-to-fourteen-mon-to-fri-date_spec" weekdays="1-5"/>
>> </date_expression>
>> </rule>
>> <nvpair id="core-hours-resource-stickiness" name="resource-stickiness" value="INFINITY"/>
>> </meta_attributes>
>> <meta_attributes id="after-hours" score="1">
>> <nvpair id="after-hours-resource-stickiness" name="resource-stickiness" value="0"/>
>> </meta_attributes>
>> </rsc_defaults>
>> </configuration>
>>
>> Thanks,
>> Prakash
>> _______________________________________________
>> Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
>> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
>>
>> Project Home: http://www.clusterlabs.org
>> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
>> Bugs: http://bugs.clusterlabs.org
>
> _______________________________________________
> Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
>
> Project Home: http://www.clusterlabs.org
> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> Bugs: http://bugs.clusterlabs.org
Hi,
Here is a test case with ocf:pacemaker:Dummy resources.
<cib epoch="861" num_updates="121" admin_epoch="0" validate-with="pacemaker-1.2" crm_feature_set="3.0.5" have-quorum="1" cib-last-written="Tue Jun 26 11:44:32 2012" dc-uuid="node1">
<configuration>
<crm_config>
<cluster_property_set id="cib-bootstrap-options">
<nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.5-5ce2879aa0d5f43d01629bc20edc6868a9352002"/>
<nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
<nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1340800389"/>
<nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
<nvpair id="cib-bootstrap-options-cluster-recheck-interval" name="cluster-recheck-interval" value="1min"/>
<nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
</cluster_property_set>
</crm_config>
<nodes>
<node id="node1" uname="node1" type="normal">
<instance_attributes id="nodes-node1">
<nvpair id="nodes-node1-standby" name="standby" value="off"/>
</instance_attributes>
</node>
<node id="node2" uname="node2" type="normal">
<instance_attributes id="nodes-node2">
<nvpair id="nodes-node2-standby" name="standby" value="off"/>
</instance_attributes>
</node>
</nodes>
<resources>
<clone id="cl_dummy">
<meta_attributes id="cl_dummy-meta_attributes">
<nvpair id="cl_dummy-meta_attributes-interleave" name="interleave" value="true"/>
<nvpair id="cl_dummy-meta_attributes-target-role" name="target-role" value="Started"/>
</meta_attributes>
<group id="grp_dummy">
<primitive class="ocf" id="p_dummy" provider="pacemaker" type="Dummy">
<operations id="p_dummy-operations">
<op id="p_dummy-op-monitor-10" interval="10" name="monitor" timeout="20"/>
</operations>
</primitive>
</group>
</clone>
<clone id="cl_dummy1">
<meta_attributes id="cl_dummy1-meta_attributes">
<nvpair id="cl_dummy1-meta_attributes-interleave" name="interleave" value="true"/>
<nvpair id="cl_dummy1-meta_attributes-target-role" name="target-role" value="Started"/>
</meta_attributes>
<primitive class="ocf" id="p_dummy1" provider="pacemaker" type="Dummy">
<operations id="p_dummy1-operations">
<op id="p_dummy1-op-monitor-10" interval="10" name="monitor" timeout="20"/>
</operations>
</primitive>
</clone>
<clone id="cl_dummy2">
<meta_attributes id="cl_dummy2-meta_attributes">
<nvpair id="cl_dummy2-meta_attributes-interleave" name="interleave" value="true"/>
<nvpair id="cl_dummy2-meta_attributes-target-role" name="target-role" value="Started"/>
</meta_attributes>
<primitive class="ocf" id="p_dummy2" provider="pacemaker" type="Dummy">
<operations id="p_dummy2-operations">
<op id="p_dummy2-op-monitor-10" interval="10" name="monitor" timeout="20"/>
</operations>
</primitive>
</clone>
<group id="g_dummy1">
<meta_attributes id="g_dummy1-meta_attributes">
<nvpair id="g_dummy1-meta_attributes-target-role" name="target-role" value="Started"/>
</meta_attributes>
<primitive class="ocf" id="p_dummy3" provider="pacemaker" type="Dummy">
<operations id="p_dummy3-operations">
<op id="p_dummy3-op-monitor-10" interval="10" name="monitor" timeout="20"/>
</operations>
</primitive>
<primitive class="ocf" id="p_dummy4" provider="pacemaker" type="Dummy">
<operations id="p_dummy4-operations">
<op id="p_dummy4-op-monitor-10" interval="10" name="monitor" timeout="20"/>
</operations>
</primitive>
</group>
<group id="g_dummy2">
<meta_attributes id="g_dummy2-meta_attributes">
<nvpair id="g_dummy2-meta_attributes-target-role" name="target-role" value="Started"/>
</meta_attributes>
<primitive class="ocf" id="p_dummy5" provider="pacemaker" type="Dummy">
<operations id="p_dummy5-operations">
<op id="p_dummy5-op-monitor-10" interval="10" name="monitor" timeout="20"/>
</operations>
</primitive>
<primitive class="ocf" id="p_dummy6" provider="pacemaker" type="Dummy">
<operations id="p_dummy6-operations">
<op id="p_dummy6-op-monitor-10" interval="10" name="monitor" timeout="20"/>
</operations>
</primitive>
</group>
</resources>
<constraints>
<rsc_order first="cl_dummy" id="cl_dummy1-after-cl_dummy" then="cl_dummy1"/>
<rsc_order first="cl_dummy" id="cl_dummy2-after-cl_dummy" then="cl_dummy2"/>
<rsc_location id="g_dummy1-p3" node="node1" rsc="g_dummy1" score="50"/>
<rsc_location id="g_dummy2-p4" node="node2" rsc="g_dummy2" score="50"/>
<rsc_colocation id="g_dummy1-with-cl_dummy1" rsc="g_dummy1" score="+INFINITY" with-rsc="cl_dummy1"/>
<rsc_colocation id="g_dummy2-with-cl_dummy2" rsc="g_dummy2" score="+INFINITY" with-rsc="cl_dummy2"/>
<rsc_order first="cl_dummy1" id="g_dummy1-after-cl_dummy1" then="g_dummy1"/>
<rsc_order first="cl_dummy2" id="g_dummy2-after-cl_dummy2" then="g_dummy2"/>
</constraints>
</configuration>
</cib>
When I stop OpenAIS (Corosync) on node2, resource "g_dummy2" migrates over to node1 as expected. But when I start OpenAIS (Corosync) back on node2, instead of just "g_dummy2" migrating back to node2, both "g_dummy1" and "g_dummy2" stop and then "g_dummy1" gets started on node1 and "g_dummy2" gets started on node2. Is this a bug or is it expected behavior because of something in this configuration? I would expect that in this specific test case, "g_dummy1" does not get disturbed at all.
Thanks,
Prakash
More information about the Pacemaker mailing list