[Pacemaker] Corosync won't recover when a node fails
Andreas Kurz
andreas at hastexo.com
Fri Oct 4 08:37:36 UTC 2013
On 2013-10-04 03:03, David Parker wrote:
> Sure. Here's the full config:
You definitely must not use the deprecated ocf:heartbeat:drbd resource
agent but the one that comes with DRBD: ocf:linbit:drbd ... you should
see a big fat warning in your logs to not use it.
And this colocation is wrong:
...
<rsc_colocation id="drbd-nfs-ha" rsc="ms-drbd_r0"
rsc-role="Master" score="INFINITY" with-rsc="nfs_resources"/>
...
rsc and with-rsc need to be the other way round
<rsc_colocation id="drbd-nfs-ha" rsc="nfs_resources" score="INFINITY"
with-rsc="ms-drbd_r0" with-rsc-role="Master" />
... give this a try.
Regards,
Andreas
>
> <cib epoch="28" num_updates="34" admin_epoch="0"
> validate-with="pacemaker-1.2" cib-last-written="Thu Oct 3 16:26:39
> 2013" crm_feature_set="3.0.6" update-origin="test-vm-2"
> update-client="cibadmin" have-quorum="1" dc-uuid="test-vm-1">
> <configuration>
> <crm_config>
> <cluster_property_set id="cib-bootstrap-options">
> <nvpair id="cib-bootstrap-options-dc-version" name="dc-version"
> value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/>
> <nvpair id="cib-bootstrap-options-cluster-infrastructure"
> name="cluster-infrastructure" value="openais"/>
> <nvpair id="cib-bootstrap-options-expected-quorum-votes"
> name="expected-quorum-votes" value="2"/>
> <nvpair id="cib-bootstrap-options-stonith-enabled"
> name="stonith-enabled" value="false"/>
> <nvpair id="cib-bootstrap-options-no-quorum-policy"
> name="no-quorum-policy" value="ignore"/>
> </cluster_property_set>
> </crm_config>
> <nodes>
> <node id="test-vm-1" type="normal" uname="test-vm-1"/>
> <node id="test-vm-2" type="normal" uname="test-vm-2"/>
> </nodes>
> <resources>
> <group id="nfs_resources">
> <meta_attributes id="nfs_resources-meta_attributes">
> <nvpair id="nfs_resources-meta_attributes-target-role"
> name="target-role" value="Started"/>
> </meta_attributes>
> <primitive class="ocf" id="nfs_fs" provider="heartbeat"
> type="Filesystem">
> <instance_attributes id="nfs_fs-instance_attributes">
> <nvpair id="nfs_fs-instance_attributes-device" name="device"
> value="/dev/drbd1"/>
> <nvpair id="nfs_fs-instance_attributes-directory"
> name="directory" value="/export/data/"/>
> <nvpair id="nfs_fs-instance_attributes-fstype" name="fstype"
> value="ext3"/>
> <nvpair id="nfs_fs-instance_attributes-options"
> name="options" value="noatime,nodiratime"/>
> </instance_attributes>
> <operations>
> <op id="nfs_fs-start-0" interval="0" name="start" timeout="60"/>
> <op id="nfs_fs-stop-0" interval="0" name="stop" timeout="120"/>
> </operations>
> </primitive>
> <primitive class="ocf" id="nfs_ip" provider="heartbeat"
> type="IPaddr2">
> <instance_attributes id="nfs_ip-instance_attributes">
> <nvpair id="nfs_ip-instance_attributes-ip" name="ip"
> value="192.168.25.205"/>
> <nvpair id="nfs_ip-instance_attributes-cidr_netmask"
> name="cidr_netmask" value="32"/>
> </instance_attributes>
> <operations>
> <op id="nfs_ip-monitor-10s" interval="10s" name="monitor"/>
> </operations>
> <meta_attributes id="nfs_ip-meta_attributes">
> <nvpair id="nfs_ip-meta_attributes-is-managed"
> name="is-managed" value="true"/>
> </meta_attributes>
> </primitive>
> <primitive class="lsb" id="nfs" type="nfs-kernel-server">
> <operations>
> <op id="nfs-monitor-5s" interval="5s" name="monitor"/>
> <op id="nfs-start-0" interval="0" name="start" timeout="120"/>
> <op id="nfs-stop-0" interval="0" name="stop" timeout="120"/>
> </operations>
> </primitive>
> </group>
> <master id="ms-drbd_r0">
> <meta_attributes id="ms-drbd_r0-meta_attributes">
> <nvpair id="ms-drbd_r0-meta_attributes-clone-max"
> name="clone-max" value="2"/>
> <nvpair id="ms-drbd_r0-meta_attributes-notify" name="notify"
> value="true"/>
> <nvpair id="ms-drbd_r0-meta_attributes-globally-unique"
> name="globally-unique" value="false"/>
> <nvpair id="ms-drbd_r0-meta_attributes-target-role"
> name="target-role" value="Master"/>
> </meta_attributes>
> <primitive class="ocf" id="drbd_r0" provider="heartbeat"
> type="drbd">
> <instance_attributes id="drbd_r0-instance_attributes">
> <nvpair id="drbd_r0-instance_attributes-drbd_resource"
> name="drbd_resource" value="r0"/>
> </instance_attributes>
> <operations>
> <op id="drbd_r0-monitor-59s" interval="59s" name="monitor"
> role="Master" timeout="30s"/>
> <op id="drbd_r0-monitor-60s" interval="60s" name="monitor"
> role="Slave" timeout="30s"/>
> </operations>
> </primitive>
> </master>
> </resources>
> <constraints>
> <rsc_colocation id="drbd-nfs-ha" rsc="ms-drbd_r0"
> rsc-role="Master" score="INFINITY" with-rsc="nfs_resources"/>
> <rsc_order id="drbd-before-nfs" first="ms-drbd_r0"
> first-action="promote" score="INFINITY" then="nfs_resources"
> then-action="start"/>
> </constraints>
> <rsc_defaults>
> <meta_attributes id="rsc-options">
> <nvpair id="rsc-options-resource-stickiness"
> name="resource-stickiness" value="100"/>
> </meta_attributes>
> </rsc_defaults>
> </configuration>
> <status>
> <node_state id="test-vm-1" uname="test-vm-1" ha="active"
> in_ccm="true" crmd="online" join="member" expected="member"
> crm-debug-origin="do_state_transition" shutdown="0">
> <transient_attributes id="test-vm-1">
> <instance_attributes id="status-test-vm-1">
> <nvpair id="status-test-vm-1-fail-count-drbd_r0.1"
> name="fail-count-drbd_r0:1" value="1"/>
> <nvpair id="status-test-vm-1-last-failure-drbd_r0.1"
> name="last-failure-drbd_r0:1" value="1380831442"/>
> <nvpair id="status-test-vm-1-master-drbd_r0.0"
> name="master-drbd_r0:0" value="100"/>
> <nvpair id="status-test-vm-1-probe_complete"
> name="probe_complete" value="true"/>
> </instance_attributes>
> </transient_attributes>
> <lrm id="test-vm-1">
> <lrm_resources>
> <lrm_resource id="drbd_r0:0" type="drbd" class="ocf"
> provider="heartbeat">
> <lrm_rsc_op id="drbd_r0:0_last_failure_0"
> operation_key="drbd_r0:0_monitor_0" operation="monitor"
> crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
> transition-key="7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:8;7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="32" rc-code="8" op-status="0" interval="0"
> op-digest="c0e018b73fdf522b6cdd355e125af15e"/>
> <lrm_rsc_op id="drbd_r0:0_monitor_59000"
> operation_key="drbd_r0:0_monitor_59000" operation="monitor"
> crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
> transition-key="20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:8;20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="35" rc-code="8" op-status="0" interval="59000"
> op-digest="6f5adcd7f1211cdfc17850827b8582c5"/>
> </lrm_resource>
> <lrm_resource id="nfs" type="nfs-kernel-server" class="lsb">
> <lrm_rsc_op id="nfs_last_0" operation_key="nfs_start_0"
> operation="start" crm-debug-origin="build_active_RAs"
> crm_feature_set="3.0.6"
> transition-key="14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:0;14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="39" rc-code="0" op-status="0" interval="0"
> op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
> <lrm_rsc_op id="nfs_last_failure_0"
> operation_key="nfs_monitor_0" operation="monitor"
> crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
> transition-key="6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:0;6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="31" rc-code="0" op-status="0" interval="0"
> op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
> <lrm_rsc_op id="nfs_monitor_5000"
> operation_key="nfs_monitor_5000" operation="monitor"
> crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
> transition-key="2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:0;2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="40" rc-code="0" op-status="0" interval="5000"
> op-digest="4811cef7f7f94e3a35a70be7916cb2fd"/>
> </lrm_resource>
> <lrm_resource id="nfs_ip" type="IPaddr2" class="ocf"
> provider="heartbeat">
> <lrm_rsc_op id="nfs_ip_last_failure_0"
> operation_key="nfs_ip_monitor_0" operation="monitor"
> crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
> transition-key="5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:0;5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="30" rc-code="0" op-status="0" interval="0"
> op-digest="570cd25774b1ead32cb1840813adbe21"/>
> <lrm_rsc_op id="nfs_ip_monitor_10000"
> operation_key="nfs_ip_monitor_10000" operation="monitor"
> crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
> transition-key="8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:0;8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="33" rc-code="0" op-status="0" interval="10000"
> op-digest="bc929bfa78c3086ebd199cf0110b87bf"/>
> </lrm_resource>
> <lrm_resource id="nfs_fs" type="Filesystem" class="ocf"
> provider="heartbeat">
> <lrm_rsc_op id="nfs_fs_last_failure_0"
> operation_key="nfs_fs_monitor_0" operation="monitor"
> crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
> transition-key="4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:0;4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="29" rc-code="0" op-status="0" interval="0"
> op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/>
> </lrm_resource>
> </lrm_resources>
> </lrm>
> </node_state>
> <node_state id="test-vm-2" uname="test-vm-2" ha="active"
> in_ccm="true" crmd="online" join="member"
> crm-debug-origin="do_update_resource" expected="member" shutdown="0">
> <lrm id="test-vm-2">
> <lrm_resources>
> <lrm_resource id="nfs" type="nfs-kernel-server" class="lsb">
> <lrm_rsc_op id="nfs_last_0" operation_key="nfs_monitor_0"
> operation="monitor" crm-debug-origin="do_update_resource"
> crm_feature_set="3.0.6"
> transition-key="10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:7;10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="4" rc-code="7" op-status="0" interval="0" last-run="1380832563"
> last-rc-change="1380832563" exec-time="210" queue-time="0"
> op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
> </lrm_resource>
> <lrm_resource id="nfs_ip" type="IPaddr2" class="ocf"
> provider="heartbeat">
> <lrm_rsc_op id="nfs_ip_last_0"
> operation_key="nfs_ip_monitor_0" operation="monitor"
> crm-debug-origin="do_update_resource" crm_feature_set="3.0.6"
> transition-key="9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:7;9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="3" rc-code="7" op-status="0" interval="0" last-run="1380832563"
> last-rc-change="1380832563" exec-time="490" queue-time="0"
> op-digest="570cd25774b1ead32cb1840813adbe21"/>
> </lrm_resource>
> <lrm_resource id="nfs_fs" type="Filesystem" class="ocf"
> provider="heartbeat">
> <lrm_rsc_op id="nfs_fs_last_0"
> operation_key="nfs_fs_monitor_0" operation="monitor"
> crm-debug-origin="do_update_resource" crm_feature_set="3.0.6"
> transition-key="8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:7;8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="2" rc-code="7" op-status="0" interval="0" last-run="1380832563"
> last-rc-change="1380832563" exec-time="690" queue-time="0"
> op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/>
> </lrm_resource>
> <lrm_resource id="drbd_r0:1" type="drbd" class="ocf"
> provider="heartbeat">
> <lrm_rsc_op id="drbd_r0:1_last_0"
> operation_key="drbd_r0:1_start_0" operation="start"
> crm-debug-origin="do_update_resource" crm_feature_set="3.0.6"
> transition-key="26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:0;26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="6" rc-code="0" op-status="0" interval="0" last-run="1380832564"
> last-rc-change="1380832564" exec-time="840" queue-time="0"
> op-digest="c0e018b73fdf522b6cdd355e125af15e"/>
> <lrm_rsc_op id="drbd_r0:1_monitor_60000"
> operation_key="drbd_r0:1_monitor_60000" operation="monitor"
> crm-debug-origin="do_update_resource" crm_feature_set="3.0.6"
> transition-key="25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> transition-magic="0:0;25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
> call-id="8" rc-code="0" op-status="0" interval="60000"
> last-rc-change="1380832565" exec-time="310" queue-time="10"
> op-digest="6f5adcd7f1211cdfc17850827b8582c5"/>
> </lrm_resource>
> </lrm_resources>
> </lrm>
> <transient_attributes id="test-vm-2">
> <instance_attributes id="status-test-vm-2">
> <nvpair id="status-test-vm-2-probe_complete"
> name="probe_complete" value="true"/>
> <nvpair id="status-test-vm-2-master-drbd_r0.1"
> name="master-drbd_r0:1" value="75"/>
> </instance_attributes>
> </transient_attributes>
> </node_state>
> </status>
> </cib>
>
>
> On Thu, Oct 3, 2013 at 5:06 PM, Andreas Kurz <andreas at hastexo.com
> <mailto:andreas at hastexo.com>> wrote:
>
> On 2013-10-03 22:12, David Parker wrote:
> > Thanks, Andrew. The goal was to use either Pacemaker and Corosync 1.x
> > from the Debian packages, or use both compiled from source. So, with
> > the compiled version, I was hoping to avoid CMAN. However, it
> seems the
> > packaged version of Pacemaker doesn't support CMAN anyway, so it's
> moot.
> >
> > I rebuilt my VMs from scratch, re-installed Pacemaker and Corosync
> from
> > the Debian packages, but I'm still having an odd problem. Here is the
> > config portion of my CIB:
> >
> > <crm_config>
> > <cluster_property_set id="cib-bootstrap-options">
> > <nvpair id="cib-bootstrap-options-dc-version"
> name="dc-version"
> > value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/>
> > <nvpair id="cib-bootstrap-options-cluster-infrastructure"
> > name="cluster-infrastructure" value="openais"/>
> > <nvpair id="cib-bootstrap-options-expected-quorum-votes"
> > name="expected-quorum-votes" value="2"/>
> > <nvpair id="cib-bootstrap-options-stonith-enabled"
> > name="stonith-enabled" value="false"/>
> > <nvpair id="cib-bootstrap-options-no-quorum-policy"
> > name="no-quorum-policy" value="ignore"/>
> > </cluster_property_set>
> > </crm_config>
> >
> > I set no-quorum-policy=ignore based on the documentation example for a
> > 2-node cluster. But when Pacemaker starts up on the first node, the
> > DRBD resource is in slave mode and none of the other resources are
> > started (they depend on DRBD being master), and I see these lines
> in the
> > log:
> >
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: unpack_config: On
> > loss of CCM Quorum: Ignore
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs_fs (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs_ip (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > drbd_r0:0 (test-vm-1)
> >
> > I'm assuming the NFS resources show "blocked" because the resource
> they
> > depend on is not in the correct state.
> >
> > Even when the second node (test-vm-2) comes online, the state of these
> > resources does not change. I can shutdown and re-start Pacemaker over
> > and over again on test-vm-2, but nothing changes. However... and this
> > is where it gets weird... if I shut down Pacemaker on test-vm-1, then
> > all of the resources immediately fail over to test-vm-2 and start
> > correctly. And I see these lines in the log:
> >
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: unpack_config: On
> > loss of CCM Quorum: Ignore
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: stage6: Scheduling
> > Node test-vm-1 for shutdown
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs_fs (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs_ip (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Stop
> > drbd_r0:0 (test-vm-1)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Promote
> > drbd_r0:1 (Slave -> Master test-vm-2)
> >
> > After that, I can generally move the resources back and forth, and
> even
> > fail them over by hard-failing a node, without any problems. The real
> > problem is that this isn't consistent, though. Every once in a while,
> > I'll hard-fail a node and the other one will go into this "stuck"
> state
> > where Pacemaker knows it lost a node, but DRBD will stay in slave mode
> > and the other resources will never start. It seems to happen quite
> > randomly. Then, even if I restart Pacemaker on both nodes, or reboot
> > them altogether, I run into the startup issue mentioned previously.
> >
> > Any ideas?
>
> Yes, share your complete resource configuration ;-)
>
> Regards,
> Andreas
>
> >
> > Thanks,
> > Dave
> >
> >
> >
> > On Wed, Oct 2, 2013 at 1:01 AM, Andrew Beekhof <andrew at beekhof.net
> <mailto:andrew at beekhof.net>
> > <mailto:andrew at beekhof.net <mailto:andrew at beekhof.net>>> wrote:
> >
> >
> > On 02/10/2013, at 5:24 AM, David Parker <dparker at utica.edu
> <mailto:dparker at utica.edu>
> > <mailto:dparker at utica.edu <mailto:dparker at utica.edu>>> wrote:
> >
> > > Thanks, I did a little Googling and found the git repository
> for pcs.
> >
> > pcs won't help you rebuild pacemaker with cman support (or
> corosync
> > 2.x support) turned on though.
> >
> >
> > > Is there any way to make a two-node cluster work with the stock
> > Debian packages, though? It seems odd that this would be
> impossible.
> >
> > it really depends how the debian maintainers built pacemaker.
> > by the sounds of it, it only supports the pacemaker plugin
> mode for
> > corosync 1.x
> >
> > >
> > >
> > > On Tue, Oct 1, 2013 at 3:16 PM, Larry Brigman
> > <larry.brigman at gmail.com <mailto:larry.brigman at gmail.com>
> <mailto:larry.brigman at gmail.com <mailto:larry.brigman at gmail.com>>>
> wrote:
> > > pcs is another package you will need to install.
> > >
> > > On Oct 1, 2013 9:04 AM, "David Parker" <dparker at utica.edu
> <mailto:dparker at utica.edu>
> > <mailto:dparker at utica.edu <mailto:dparker at utica.edu>>> wrote:
> > > Hello,
> > >
> > > Sorry for the delay in my reply. I've been doing a lot of
> > experimentation, but so far I've had no luck.
> > >
> > > Thanks for the suggestion, but it seems I'm not able to use
> CMAN.
> > I'm running Debian Wheezy with Corosync and Pacemaker
> installed via
> > apt-get. When I installed CMAN and set up a cluster.conf file,
> > Pacemaker refused to start and said that CMAN was not supported.
> > When CMAN is not installed, Pacemaker starts up fine, but I see
> > these lines in the log:
> > >
> > > Sep 30 23:36:29 test-vm-1 crmd: [6941]: ERROR:
> > init_quorum_connection: The Corosync quorum API is not
> supported in
> > this build
> > > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: ERROR:
> > pcmk_child_exit: Child process crmd exited (pid=6941, rc=100)
> > > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: WARN:
> > pcmk_child_exit: Pacemaker child process crmd no longer wishes
> to be
> > respawned. Shutting ourselves down.
> > >
> > > So, then I checked to see which plugins are supported:
> > >
> > > # pacemakerd -F
> > > Pacemaker 1.1.7 (Build:
> ee0730e13d124c3d58f00016c3376a1de5323cff)
> > > Supporting: generated-manpages agent-manpages ncurses
> heartbeat
> > corosync-plugin snmp libesmtp
> > >
> > > Am I correct in believing that this Pacemaker package has been
> > compiled without support for any quorum API? If so, does anyone
> > know if there is a Debian package which has the correct support?
> > >
> > > I also tried compiling LibQB, Corosync and Pacemaker from source
> > via git, following the instructions documented here:
> > >
> > > http://clusterlabs.org/wiki/SourceInstall
> > >
> > > I was hopeful that this would work, because as I understand it,
> > Corosync 2.x no longer uses CMAN. Everything compiled and started
> > fine, but the compiled version of Pacemaker did not include either
> > the 'crm' or 'pcs' commands. Do I need to install something
> else in
> > order to get one of these?
> > >
> > > Any and all help is greatly appreciated!
> > >
> > > Thanks,
> > > Dave
> > >
> > >
> > > On Wed, Sep 25, 2013 at 6:08 AM, David Lang <david at lang.hm
> <mailto:david at lang.hm>
> > <mailto:david at lang.hm <mailto:david at lang.hm>>> wrote:
> > > the cluster is trying to reach a quorum (the majority of the
> nodes
> > talking to each other) and that is never going to happen with only
> > one node. so you have to disable this.
> > >
> > > try putting
> > > <cman two_node="1" expected_votes="1" transport="udpu"/>
> > > in your cluster.conf
> > >
> > > David Lang
> > >
> > > On Tue, 24 Sep 2013, David Parker wrote:
> > >
> > > Date: Tue, 24 Sep 2013 11:48:59 -0400
> > > From: David Parker <dparker at utica.edu
> <mailto:dparker at utica.edu> <mailto:dparker at utica.edu
> <mailto:dparker at utica.edu>>>
> > > Reply-To: The Pacemaker cluster resource manager
> > > <pacemaker at oss.clusterlabs.org
> <mailto:pacemaker at oss.clusterlabs.org>
> > <mailto:pacemaker at oss.clusterlabs.org
> <mailto:pacemaker at oss.clusterlabs.org>>>
> > > To: The Pacemaker cluster resource manager
> > <pacemaker at oss.clusterlabs.org
> <mailto:pacemaker at oss.clusterlabs.org>
> <mailto:pacemaker at oss.clusterlabs.org
> <mailto:pacemaker at oss.clusterlabs.org>>>
> > > Subject: Re: [Pacemaker] Corosync won't recover when a node
> fails
> > >
> > >
> > > I forgot to mention, OS is Debian Wheezy 64-bit, Corosync and
> > Pacemaker
> > > installed from packages via apt-get, and there are no local
> > firewall rules
> > > in place:
> > >
> > > # iptables -L
> > > Chain INPUT (policy ACCEPT)
> > > target prot opt source destination
> > >
> > > Chain FORWARD (policy ACCEPT)
> > > target prot opt source destination
> > >
> > > Chain OUTPUT (policy ACCEPT)
> > > target prot opt source destination
> > >
> > >
> > > On Tue, Sep 24, 2013 at 11:41 AM, David Parker
> <dparker at utica.edu <mailto:dparker at utica.edu>
> > <mailto:dparker at utica.edu <mailto:dparker at utica.edu>>> wrote:
> > >
> > > Hello,
> > >
> > > I have a 2-node cluster using Corosync and Pacemaker, where the
> > nodes are
> > > actually two VirtualBox VMs on the same physical machine. I
> have some
> > > resources set up in Pacemaker, and everything works fine if
> I move
> > them in
> > > a controlled way with the "crm_resource -r <resource> --move
> > --node <node>"
> > > command.
> > >
> > > However, when I hard-fail one of the nodes via the "poweroff"
> > command in
> > > Virtual Box, which "pulls the plug" on the VM, the resources do
> > not move,
> > > and I see the following output in the log on the remaining node:
> > >
> > > Sep 24 11:20:30 corosync [TOTEM ] The token was lost in the
> > OPERATIONAL
> > > state.
> > > Sep 24 11:20:30 corosync [TOTEM ] A processor failed,
> forming new
> > > configuration.
> > > Sep 24 11:20:30 corosync [TOTEM ] entering GATHER state from 2.
> > > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: debug: rsc:drbd_r0:0
> > monitor[31]
> > > (pid 8495)
> > > drbd[8495]: 2013/09/24_11:20:31 WARNING: This resource
> agent is
> > > deprecated and may be removed in a future release. See the man
> > page for
> > > details. To suppress this warning, set the "ignore_deprecation"
> > resource
> > > parameter to true.
> > > drbd[8495]: 2013/09/24_11:20:31 WARNING: This resource
> agent is
> > > deprecated and may be removed in a future release. See the man
> > page for
> > > details. To suppress this warning, set the "ignore_deprecation"
> > resource
> > > parameter to true.
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Calling
> drbdadm -c
> > > /etc/drbd.conf role r0
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Exit code 0
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Command output:
> > > Secondary/Primary
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Calling
> drbdadm -c
> > > /etc/drbd.conf cstate r0
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Exit code 0
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Command output:
> > Connected
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0 status:
> > Secondary/Primary
> > > Secondary Primary Connected
> > > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: info: operation
> monitor[31] on
> > > drbd_r0:0 for client 2506: pid 8495 exited with return code 0
> > > Sep 24 11:20:32 corosync [TOTEM ] entering GATHER state from 0.
> > > Sep 24 11:20:34 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:34 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:36 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:36 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:38 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:38 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:40 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:40 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:40 corosync [TOTEM ] Totem is unable to form a
> cluster
> > > because of an operating system or network fault. The most common
> > cause of
> > > this message is that the local firewall is configured
> improperly.
> > > Sep 24 11:20:43 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:43 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:43 corosync [TOTEM ] Totem is unable to form a
> cluster
> > > because of an operating system or network fault. The most common
> > cause of
> > > this message is that the local firewall is configured
> improperly.
> > > Sep 24 11:20:45 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:45 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:45 corosync [TOTEM ] Totem is unable to form a
> cluster
> > > because of an operating system or network fault. The most common
> > cause of
> > > this message is that the local firewall is configured
> improperly.
> > > Sep 24 11:20:47 corosync [TOTEM ] The consensus timeout expired.
> > >
> > > Those last 3 messages just repeat over and over, the cluster
> never
> > > recovers, and the resources never move. "crm_mon" reports
> that the
> > > resources are still running on the dead node, and shows no
> > indication that
> > > anything has gone wrong.
> > >
> > > Does anyone know what the issue could be? My expectation
> was that the
> > > remaining node would become the sole member of the cluster, take
> > over the
> > > resources, and everything would keep running.
> > >
> > > For reference, my corosync.conf file is below:
> > >
> > > compatibility: whitetank
> > >
> > > totem {
> > > version: 2
> > > secauth: off
> > > interface {
> > > member {
> > > memberaddr: 192.168.25.201
> > > }
> > > member {
> > > memberaddr: 192.168.25.202
> > > }
> > > ringnumber: 0
> > > bindnetaddr: 192.168.25.0
> > > mcastport: 5405
> > > }
> > > transport: udpu
> > > }
> > >
> > > logging {
> > > fileline: off
> > > to_logfile: yes
> > > to_syslog: yes
> > > debug: on
> > > logfile: /var/log/cluster/corosync.log
> > > timestamp: on
> > > logger_subsys {
> > > subsys: AMF
> > > debug: on
> > > }
> > > }
> > >
> > >
> > > Thanks!
> > > Dave
> > >
> > > --
> > > Dave Parker
> > > Systems Administrator
> > > Utica College
> > > Integrated Information Technology Services
> > > (315) 792-3229
> > > Registered Linux User #408177
> > >
> > >
> > >
> > >
> > >
> > > _______________________________________________
> > >
> > > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>
> > <mailto:Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>>
> > >
> > > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> > >
> > >
> > >
> > > Project Home: http://www.clusterlabs.org
> > >
> > > Getting started:
> > http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> > >
> > > Bugs: http://bugs.clusterlabs.org
> > >
> > >
> > > _______________________________________________
> > > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>
> > <mailto:Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>>
> > > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> > >
> > > Project Home: http://www.clusterlabs.org
> > > Getting started:
> > http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> > > Bugs: http://bugs.clusterlabs.org
> > >
> > >
> > >
> > >
> > > --
> > > Dave Parker
> > > Systems Administrator
> > > Utica College
> > > Integrated Information Technology Services
> > > (315) 792-3229 <tel:%28315%29%20792-3229>
> <tel:%28315%29%20792-3229>
> > > Registered Linux User #408177
> > >
> > > _______________________________________________
> > > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>
> > <mailto:Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>>
> > > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> > >
> > > Project Home: http://www.clusterlabs.org
> > > Getting started:
> > http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> > > Bugs: http://bugs.clusterlabs.org
> > >
> > >
> > > _______________________________________________
> > > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>
> > <mailto:Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>>
> > > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> > >
> > > Project Home: http://www.clusterlabs.org
> > > Getting started:
> > http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> > > Bugs: http://bugs.clusterlabs.org
> > >
> > >
> > >
> > >
> > > --
> > > Dave Parker
> > > Systems Administrator
> > > Utica College
> > > Integrated Information Technology Services
> > > (315) 792-3229 <tel:%28315%29%20792-3229>
> <tel:%28315%29%20792-3229>
> > > Registered Linux User #408177
> > > _______________________________________________
> > > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>
> > <mailto:Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>>
> > > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> > >
> > > Project Home: http://www.clusterlabs.org
> > > Getting started:
> > http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> > > Bugs: http://bugs.clusterlabs.org
> >
> >
> > _______________________________________________
> > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>
> > <mailto:Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>>
> > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> >
> > Project Home: http://www.clusterlabs.org
> > Getting started:
> http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> > Bugs: http://bugs.clusterlabs.org
> >
> >
> >
> >
> > --
> > Dave Parker
> > Systems Administrator
> > Utica College
> > Integrated Information Technology Services
> > (315) 792-3229 <tel:%28315%29%20792-3229>
> > Registered Linux User #408177
> >
> >
> > This body part will be downloaded on demand.
> >
>
>
> --
> Need help with Pacemaker?
> http://www.hastexo.com/now
>
>
>
> _______________________________________________
> Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> <mailto:Pacemaker at oss.clusterlabs.org>
> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
>
> Project Home: http://www.clusterlabs.org
> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> Bugs: http://bugs.clusterlabs.org
>
>
>
>
> --
> Dave Parker
> Systems Administrator
> Utica College
> Integrated Information Technology Services
> (315) 792-3229
> Registered Linux User #408177
>
>
> _______________________________________________
> Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
>
> Project Home: http://www.clusterlabs.org
> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> Bugs: http://bugs.clusterlabs.org
>
--
Need help with Pacemaker?
http://www.hastexo.com/now
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 287 bytes
Desc: OpenPGP digital signature
URL: <https://lists.clusterlabs.org/pipermail/pacemaker/attachments/20131004/a204df18/attachment-0004.sig>
More information about the Pacemaker
mailing list