[Pacemaker] Cluster issues - all resources restarting when a node reboots

Velayutham, Prakash Prakash.Velayutham at cchmc.org
Sat Jun 30 10:04:55 EDT 2012


On Jun 28, 2012, at 1:00 AM, Andrew Beekhof wrote:

> On Thu, Jun 28, 2012 at 4:03 AM, Velayutham, Prakash
> <Prakash.Velayutham at cchmc.org> wrote:
>> Hello all,
>> 
>> Below is the relevant part of my CIB. I am facing 2 issues.
>> 
>> 1. Every time a node in the cluster reboots, all resources get restarted in the entire cluster.
>> 2. I have a time-based rule for stickiness, but it does not work.
>> 
>> Can anyone point out what is wrong with the configuration?
> 
> Too many dots?
> Impossible to say really, perhaps file a bug and attach a crm_report
> (run with --help) covering the time of your test case.
> 
>> 
>>   <crm_config>
>>      <cluster_property_set id="cib-bootstrap-options">
>>        <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.5-5ce2879aa0d5f43d01629bc20edc6868a9352002"/>
>>        <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
>>        <nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
>>        <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1340724782"/>
>>        <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
>>        <nvpair id="cib-bootstrap-options-cluster-recheck-interval" name="cluster-recheck-interval" value="1min"/>
>>      </cluster_property_set>
>>    </crm_config>
>>    <nodes>
>>      <node id="bmimysqlp3" uname="bmimysqlp3" type="normal">
>>        ...
>>      </node>
>>      <node id="bmimysqlp4" uname="bmimysqlp4" type="normal">
>>        ...
>>      </node>
>>    </nodes>
>>    <resources>
>>      <primitive class="stonith" id="stonith-1" type="external/riloe">
>>        ...
>>      </primitive>
>>      <primitive class="stonith" id="stonith-2" type="external/riloe">
>>        ...
>>      </primitive>
>>      <group id="g_mysql-rschp01">
>>        <primitive class="ocf" id="p_vip-rschp01" provider="heartbeat" type="IPaddr2">
>>          <operations id="p_vip-rschp01-operations">
>>                ...
>>          </operations>
>>          <instance_attributes id="p_vip-rschp01-instance_attributes">
>>                ...
>>          </instance_attributes>
>>          <meta_attributes id="p_vip-rschp01-meta_attributes">                      </meta_attributes>
>>        </primitive>
>>        <primitive class="ocf" id="p_mysql-rschp01" provider="heartbeat" type="mysql">
>>          <operations id="p_mysql-rschp01-operations">
>>          </operations>
>>          <instance_attributes id="p_mysql-rschp01-instance_attributes">
>>                ...
>>          </instance_attributes>
>>          <meta_attributes id="p_mysql-rschp01-meta_attributes">
>>          </meta_attributes>
>>        </primitive>
>>      </group>
>>      <group id="g_mysql-rschp02">
>>        <primitive class="ocf" id="p_vip-rschp02" provider="heartbeat" type="IPaddr2">
>>          <operations id="p_vip-rschp02-operations">
>>                ...
>>          </operations>
>>          <instance_attributes id="p_vip-rschp02-instance_attributes">
>>                ...
>>          </instance_attributes>
>>          <meta_attributes id="p_vip-rschp02-meta_attributes">                      </meta_attributes>
>>        </primitive>
>>        <primitive class="ocf" id="p_mysql-rschp02" provider="heartbeat" type="mysql">
>>          <operations id="p_mysql-rschp02-operations">
>>          </operations>
>>          <instance_attributes id="p_mysql-rschp02-instance_attributes">
>>                ...
>>          </instance_attributes>
>>          <meta_attributes id="p_mysql-rschp02-meta_attributes">                      </meta_attributes>
>>        </primitive>
>>      </group>
>>      <clone id="c_ping">
>>        <meta_attributes id="c_ping-meta_attributes">
>>          <nvpair id="c_ping-meta_attributes-target-role" name="target-role" value="Started"/>
>>        </meta_attributes>
>>        <primitive class="ocf" id="p_ping" provider="pacemaker" type="ping">
>>          <operations id="p_ping-operations">
>>            <op id="p_ping-op-monitor-15" interval="15" name="monitor" timeout="10"/>
>>          </operations>
>>          <instance_attributes id="p_ping-instance_attributes">
>>            <nvpair id="p_ping-instance_attributes-host_list" name="host_list" value="10.200.31.1"/>
>>            <nvpair id="p_ping-instance_attributes-dampen" name="dampen" value="5s"/>
>>            <nvpair id="p_ping-instance_attributes-multiplier" name="multiplier" value="100"/>
>>            <nvpair id="p_ping-instance_attributes-attempts" name="attempts" value="2"/>
>>            <nvpair id="p_ping-instance_attributes-name" name="name" value="pingd"/>
>>          </instance_attributes>
>>          <meta_attributes id="p_ping-meta_attributes">
>>            <nvpair id="p_ping-meta_attributes-target-role" name="target-role" value="Started"/>
>>          </meta_attributes>
>>        </primitive>
>>      </clone>
>>      <clone id="c_stack">
>>        <group id="g_stack">
>>          <primitive class="ocf" id="p_dlmcontrold" provider="pacemaker" type="controld">
>>            <operations id="p_dlmcontrold-operations">
>>            </operations>
>>            <instance_attributes id="p_dlmcontrold-instance_attributes">
>>              <nvpair id="p_dlmcontrold-instance_attributes-daemon" name="daemon" value="dlm_controld.pcmk"/>
>>            </instance_attributes>
>>          </primitive>
>>          <primitive class="ocf" id="p_ocfs2controld" provider="ocfs2" type="o2cb">
>>            <operations id="p_ocfs2controld-operations">
>>            </operations>
>>            <instance_attributes id="p_ocfs2controld-instance_attributes">
>>              <nvpair id="p_ocfs2controld-instance_attributes-stack" name="stack" value="pcmk"/>
>>            </instance_attributes>
>>          </primitive>
>>        </group>
>>      </clone>
>>      <clone id="c_ocfs2-u02">
>>        <primitive class="ocf" id="p_ocfs2-u02" provider="heartbeat" type="Filesystem">
>>          <operations id="p_ocfs2-u02-operations">
>>            <op id="p_ocfs2-u02-op-monitor-60" interval="60" name="monitor" timeout="45">
>>              <instance_attributes id="p_ocfs2-u02-op-monitor-60-instance_attributes">
>>                <nvpair id="p_ocfs2-u02-op-monitor-60-instance_attributes-OCF_CHECK_LEVEL" name="OCF_CHECK_LEVEL" value="20"/>
>>              </instance_attributes>
>>            </op>
>>            <op id="p_ocfs2-u02-op-start-0" interval="0" name="start" timeout="60"/>
>>          </operations>
>>          <instance_attributes id="p_ocfs2-u02-instance_attributes">
>>            <nvpair id="p_ocfs2-u02-instance_attributes-device" name="device" value="/dev/mapper/bmimysqlp3_p4_vol1"/>
>>            <nvpair id="p_ocfs2-u02-instance_attributes-directory" name="directory" value="/u02"/>
>>            <nvpair id="p_ocfs2-u02-instance_attributes-fstype" name="fstype" value="ocfs2"/>
>>            <nvpair id="p_ocfs2-u02-instance_attributes-options" name="options" value="rw,nointr,data=writeback"/>
>>          </instance_attributes>
>>          <meta_attributes id="p_ocfs2-u02-meta_attributes">                      </meta_attributes>
>>        </primitive>
>>      </clone>
>>      <clone id="c_ocfs2-u03">
>>        <meta_attributes id="c_ocfs2-u03-meta_attributes">
>>          <nvpair id="c_ocfs2-u03-meta_attributes-interleave" name="interleave" value="true"/>
>>          <nvpair id="c_ocfs2-u03-meta_attributes-target-role" name="target-role" value="Started"/>
>>        </meta_attributes>
>>        <primitive class="ocf" id="p_ocfs2-u03" provider="heartbeat" type="Filesystem">
>>          <operations id="p_ocfs2-u03-operations">
>>            <op id="p_ocfs2-u03-op-monitor-60" interval="60" name="monitor" timeout="45">
>>              <instance_attributes id="p_ocfs2-u03-op-monitor-60-instance_attributes">
>>                <nvpair id="p_ocfs2-u03-op-monitor-60-instance_attributes-OCF_CHECK_LEVEL" name="OCF_CHECK_LEVEL" value="20"/>
>>              </instance_attributes>
>>            </op>
>>            <op id="p_ocfs2-u03-op-start-0" interval="0" name="start" timeout="60"/>
>>          </operations>
>>          <instance_attributes id="p_ocfs2-u03-instance_attributes">
>>            <nvpair id="p_ocfs2-u03-instance_attributes-device" name="device" value="/dev/mapper/bmimysqlp3_p4_vol2"/>
>>            <nvpair id="p_ocfs2-u03-instance_attributes-directory" name="directory" value="/u03"/>
>>            <nvpair id="p_ocfs2-u03-instance_attributes-fstype" name="fstype" value="ocfs2"/>
>>            <nvpair id="p_ocfs2-u03-instance_attributes-options" name="options" value="rw,nointr,data=writeback"/>
>>          </instance_attributes>
>>          <meta_attributes id="p_ocfs2-u03-meta_attributes">
>>            <nvpair id="p_ocfs2-u03-meta_attributes-target-role" name="target-role" value="Started"/>
>>          </meta_attributes>
>>        </primitive>
>>      </clone>
>>    </resources>
>>    <constraints>
>>      <rsc_order first="c_stack" id="c_ocfs2-u02-after-c_stack" then="c_ocfs2-u02"/>
>>      <rsc_order first="c_stack" id="c_ocfs2-u03-after-c_stack" then="c_ocfs2-u03"/>
>>      <rsc_order first="c_ocfs2-u02" id="g_mysql-rschp01-after-c_ocfs2-u02" then="g_mysql-rschp01"/>
>>      <rsc_order first="c_ocfs2-u03" id="g_mysql-rschp02-after-c_ocfs2-u03" then="g_mysql-rschp02"/>
>>      <rsc_location id="stonith-1-never-on-bmimysqlp3" node="bmimysqlp3" rsc="stonith-1" score="-INFINITY"/>
>>      <rsc_location id="stonith-2-never-on-bmimysqlp4" node="bmimysqlp4" rsc="stonith-2" score="-INFINITY"/>
>>      <rsc_location id="g_mysql-rschp01-prefers-bmimysqlp3" node="bmimysqlp3" rsc="g_mysql-rschp01" score="50"/>
>>      <rsc_location id="g_mysql-rschp02-prefers-bmimysqlp4" node="bmimysqlp4" rsc="g_mysql-rschp02" score="50"/>
>>      <rsc_colocation id="g_mysql-rschp01-with-c_ocfs2-u02" rsc="g_mysql-rschp01" score="+INFINITY" with-rsc="c_ocfs2-u02"/>
>>      <rsc_colocation id="g_mysql-rschp02-with-c_ocfs2-u03" rsc="g_mysql-rschp02" score="+INFINITY" with-rsc="c_ocfs2-u03"/>
>>      <rsc_location id="g_mysql-rschp01-no-connectivity" rsc="g_mysql-rschp01">
>>        <rule id="ping-exclude-rule-rschp01" score="-INFINITY">
>>          <expression attribute="pingd" id="ping-exclude-rule-rschp01-pingd" operation="lte" value="0"/>
>>        </rule>
>>      </rsc_location>
>>      <rsc_location id="g_mysql-rschp02-no-connectivity" rsc="g_mysql-rschp02">
>>        <rule id="ping-exclude-rule-2" score="-INFINITY">
>>          <expression attribute="pingd" id="ping-exclude-rule-2-pingd" operation="lte" value="0"/>
>>        </rule>
>>      </rsc_location>
>>    </constraints>
>>    <rsc_defaults>
>>      <meta_attributes id="core-hours" score="2">
>>        <rule id="core-hour-rule" score="0">
>>          <date_expression id="seven-to-fourteen-mon-to-fri" operation="date_spec">
>>            <date_spec hours="7-14" id="seven-to-fourteen-mon-to-fri-date_spec" weekdays="1-5"/>
>>          </date_expression>
>>        </rule>
>>        <nvpair id="core-hours-resource-stickiness" name="resource-stickiness" value="INFINITY"/>
>>      </meta_attributes>
>>      <meta_attributes id="after-hours" score="1">
>>        <nvpair id="after-hours-resource-stickiness" name="resource-stickiness" value="0"/>
>>      </meta_attributes>
>>    </rsc_defaults>
>>  </configuration>
>> 
>> Thanks,
>> Prakash
>> _______________________________________________
>> Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
>> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
>> 
>> Project Home: http://www.clusterlabs.org
>> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
>> Bugs: http://bugs.clusterlabs.org
> 
> _______________________________________________
> Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> 
> Project Home: http://www.clusterlabs.org
> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> Bugs: http://bugs.clusterlabs.org

Hi,

Here is a test case with ocf:pacemaker:Dummy resources.

<cib epoch="861" num_updates="121" admin_epoch="0" validate-with="pacemaker-1.2" crm_feature_set="3.0.5" have-quorum="1" cib-last-written="Tue Jun 26 11:44:32 2012" dc-uuid="node1">
  <configuration>
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.5-5ce2879aa0d5f43d01629bc20edc6868a9352002"/>
        <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
        <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1340800389"/>
        <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
        <nvpair id="cib-bootstrap-options-cluster-recheck-interval" name="cluster-recheck-interval" value="1min"/>
        <nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
      </cluster_property_set>
    </crm_config>
    <nodes>
      <node id="node1" uname="node1" type="normal">
        <instance_attributes id="nodes-node1">
          <nvpair id="nodes-node1-standby" name="standby" value="off"/>
        </instance_attributes>
      </node>
      <node id="node2" uname="node2" type="normal">
        <instance_attributes id="nodes-node2">
          <nvpair id="nodes-node2-standby" name="standby" value="off"/>
        </instance_attributes>
      </node>
    </nodes>
    <resources>
      <clone id="cl_dummy">
        <meta_attributes id="cl_dummy-meta_attributes">
          <nvpair id="cl_dummy-meta_attributes-interleave" name="interleave" value="true"/>
          <nvpair id="cl_dummy-meta_attributes-target-role" name="target-role" value="Started"/>
        </meta_attributes>
        <group id="grp_dummy">
          <primitive class="ocf" id="p_dummy" provider="pacemaker" type="Dummy">
            <operations id="p_dummy-operations">
              <op id="p_dummy-op-monitor-10" interval="10" name="monitor" timeout="20"/>
            </operations>
          </primitive>
        </group>
      </clone>
      <clone id="cl_dummy1">
        <meta_attributes id="cl_dummy1-meta_attributes">
          <nvpair id="cl_dummy1-meta_attributes-interleave" name="interleave" value="true"/>
          <nvpair id="cl_dummy1-meta_attributes-target-role" name="target-role" value="Started"/>
        </meta_attributes>
        <primitive class="ocf" id="p_dummy1" provider="pacemaker" type="Dummy">
          <operations id="p_dummy1-operations">
            <op id="p_dummy1-op-monitor-10" interval="10" name="monitor" timeout="20"/>
          </operations>
        </primitive>
      </clone>
      <clone id="cl_dummy2">
        <meta_attributes id="cl_dummy2-meta_attributes">
          <nvpair id="cl_dummy2-meta_attributes-interleave" name="interleave" value="true"/>
          <nvpair id="cl_dummy2-meta_attributes-target-role" name="target-role" value="Started"/>
        </meta_attributes>
        <primitive class="ocf" id="p_dummy2" provider="pacemaker" type="Dummy">
          <operations id="p_dummy2-operations">
            <op id="p_dummy2-op-monitor-10" interval="10" name="monitor" timeout="20"/>
          </operations>
        </primitive>
      </clone>
      <group id="g_dummy1">
        <meta_attributes id="g_dummy1-meta_attributes">
          <nvpair id="g_dummy1-meta_attributes-target-role" name="target-role" value="Started"/>
        </meta_attributes>
        <primitive class="ocf" id="p_dummy3" provider="pacemaker" type="Dummy">
          <operations id="p_dummy3-operations">
            <op id="p_dummy3-op-monitor-10" interval="10" name="monitor" timeout="20"/>
          </operations>
        </primitive>
        <primitive class="ocf" id="p_dummy4" provider="pacemaker" type="Dummy">
          <operations id="p_dummy4-operations">
            <op id="p_dummy4-op-monitor-10" interval="10" name="monitor" timeout="20"/>
          </operations>
        </primitive>
      </group>
      <group id="g_dummy2">
        <meta_attributes id="g_dummy2-meta_attributes">
          <nvpair id="g_dummy2-meta_attributes-target-role" name="target-role" value="Started"/>
        </meta_attributes>
        <primitive class="ocf" id="p_dummy5" provider="pacemaker" type="Dummy">
          <operations id="p_dummy5-operations">
            <op id="p_dummy5-op-monitor-10" interval="10" name="monitor" timeout="20"/>
          </operations>
        </primitive>
        <primitive class="ocf" id="p_dummy6" provider="pacemaker" type="Dummy">
          <operations id="p_dummy6-operations">
            <op id="p_dummy6-op-monitor-10" interval="10" name="monitor" timeout="20"/>
          </operations>
        </primitive>
      </group>
    </resources>
    <constraints>
      <rsc_order first="cl_dummy" id="cl_dummy1-after-cl_dummy" then="cl_dummy1"/>
      <rsc_order first="cl_dummy" id="cl_dummy2-after-cl_dummy" then="cl_dummy2"/>
      <rsc_location id="g_dummy1-p3" node="node1" rsc="g_dummy1" score="50"/>
      <rsc_location id="g_dummy2-p4" node="node2" rsc="g_dummy2" score="50"/>
      <rsc_colocation id="g_dummy1-with-cl_dummy1" rsc="g_dummy1" score="+INFINITY" with-rsc="cl_dummy1"/>
      <rsc_colocation id="g_dummy2-with-cl_dummy2" rsc="g_dummy2" score="+INFINITY" with-rsc="cl_dummy2"/>
      <rsc_order first="cl_dummy1" id="g_dummy1-after-cl_dummy1" then="g_dummy1"/>
      <rsc_order first="cl_dummy2" id="g_dummy2-after-cl_dummy2" then="g_dummy2"/>
    </constraints>
  </configuration>

When I stop OpenAIS (Corosync) on node2, resource "g_dummy2" migrates over to node1 as expected. But when I start OpenAIS (Corosync) on node2 again, instead of only "g_dummy2" migrating back to node2, both "g_dummy1" and "g_dummy2" are stopped; "g_dummy1" is then started again on node1 and "g_dummy2" is started on node2. Is this a bug, or is it expected behavior caused by something in this configuration? I would expect "g_dummy1" not to be disturbed at all in this test case.

Thanks,
Prakash



More information about the Pacemaker mailing list