[Pacemaker] Corosync won't recover when a node fails
David Parker
dparker at utica.edu
Fri Oct 4 01:03:44 UTC 2013
Sure. Here's the full config:
<cib epoch="28" num_updates="34" admin_epoch="0"
validate-with="pacemaker-1.2" cib-last-written="Thu Oct 3 16:26:39 2013"
crm_feature_set="3.0.6" update-origin="test-vm-2" update-client="cibadmin"
have-quorum="1" dc-uuid="test-vm-1">
<configuration>
<crm_config>
<cluster_property_set id="cib-bootstrap-options">
<nvpair id="cib-bootstrap-options-dc-version" name="dc-version"
value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/>
<nvpair id="cib-bootstrap-options-cluster-infrastructure"
name="cluster-infrastructure" value="openais"/>
<nvpair id="cib-bootstrap-options-expected-quorum-votes"
name="expected-quorum-votes" value="2"/>
<nvpair id="cib-bootstrap-options-stonith-enabled"
name="stonith-enabled" value="false"/>
<nvpair id="cib-bootstrap-options-no-quorum-policy"
name="no-quorum-policy" value="ignore"/>
</cluster_property_set>
</crm_config>
<nodes>
<node id="test-vm-1" type="normal" uname="test-vm-1"/>
<node id="test-vm-2" type="normal" uname="test-vm-2"/>
</nodes>
<resources>
<group id="nfs_resources">
<meta_attributes id="nfs_resources-meta_attributes">
<nvpair id="nfs_resources-meta_attributes-target-role"
name="target-role" value="Started"/>
</meta_attributes>
<primitive class="ocf" id="nfs_fs" provider="heartbeat"
type="Filesystem">
<instance_attributes id="nfs_fs-instance_attributes">
<nvpair id="nfs_fs-instance_attributes-device" name="device"
value="/dev/drbd1"/>
<nvpair id="nfs_fs-instance_attributes-directory"
name="directory" value="/export/data/"/>
<nvpair id="nfs_fs-instance_attributes-fstype" name="fstype"
value="ext3"/>
<nvpair id="nfs_fs-instance_attributes-options" name="options"
value="noatime,nodiratime"/>
</instance_attributes>
<operations>
<op id="nfs_fs-start-0" interval="0" name="start" timeout="60"/>
<op id="nfs_fs-stop-0" interval="0" name="stop" timeout="120"/>
</operations>
</primitive>
<primitive class="ocf" id="nfs_ip" provider="heartbeat"
type="IPaddr2">
<instance_attributes id="nfs_ip-instance_attributes">
<nvpair id="nfs_ip-instance_attributes-ip" name="ip"
value="192.168.25.205"/>
<nvpair id="nfs_ip-instance_attributes-cidr_netmask"
name="cidr_netmask" value="32"/>
</instance_attributes>
<operations>
<op id="nfs_ip-monitor-10s" interval="10s" name="monitor"/>
</operations>
<meta_attributes id="nfs_ip-meta_attributes">
<nvpair id="nfs_ip-meta_attributes-is-managed"
name="is-managed" value="true"/>
</meta_attributes>
</primitive>
<primitive class="lsb" id="nfs" type="nfs-kernel-server">
<operations>
<op id="nfs-monitor-5s" interval="5s" name="monitor"/>
<op id="nfs-start-0" interval="0" name="start" timeout="120"/>
<op id="nfs-stop-0" interval="0" name="stop" timeout="120"/>
</operations>
</primitive>
</group>
<master id="ms-drbd_r0">
<meta_attributes id="ms-drbd_r0-meta_attributes">
<nvpair id="ms-drbd_r0-meta_attributes-clone-max"
name="clone-max" value="2"/>
<nvpair id="ms-drbd_r0-meta_attributes-notify" name="notify"
value="true"/>
<nvpair id="ms-drbd_r0-meta_attributes-globally-unique"
name="globally-unique" value="false"/>
<nvpair id="ms-drbd_r0-meta_attributes-target-role"
name="target-role" value="Master"/>
</meta_attributes>
<primitive class="ocf" id="drbd_r0" provider="heartbeat"
type="drbd">
<instance_attributes id="drbd_r0-instance_attributes">
<nvpair id="drbd_r0-instance_attributes-drbd_resource"
name="drbd_resource" value="r0"/>
</instance_attributes>
<operations>
<op id="drbd_r0-monitor-59s" interval="59s" name="monitor"
role="Master" timeout="30s"/>
<op id="drbd_r0-monitor-60s" interval="60s" name="monitor"
role="Slave" timeout="30s"/>
</operations>
</primitive>
</master>
</resources>
<constraints>
<rsc_colocation id="drbd-nfs-ha" rsc="ms-drbd_r0" rsc-role="Master"
score="INFINITY" with-rsc="nfs_resources"/>
<rsc_order id="drbd-before-nfs" first="ms-drbd_r0"
first-action="promote" score="INFINITY" then="nfs_resources"
then-action="start"/>
</constraints>
<rsc_defaults>
<meta_attributes id="rsc-options">
<nvpair id="rsc-options-resource-stickiness"
name="resource-stickiness" value="100"/>
</meta_attributes>
</rsc_defaults>
</configuration>
<status>
<node_state id="test-vm-1" uname="test-vm-1" ha="active" in_ccm="true"
crmd="online" join="member" expected="member"
crm-debug-origin="do_state_transition" shutdown="0">
<transient_attributes id="test-vm-1">
<instance_attributes id="status-test-vm-1">
<nvpair id="status-test-vm-1-fail-count-drbd_r0.1"
name="fail-count-drbd_r0:1" value="1"/>
<nvpair id="status-test-vm-1-last-failure-drbd_r0.1"
name="last-failure-drbd_r0:1" value="1380831442"/>
<nvpair id="status-test-vm-1-master-drbd_r0.0"
name="master-drbd_r0:0" value="100"/>
<nvpair id="status-test-vm-1-probe_complete"
name="probe_complete" value="true"/>
</instance_attributes>
</transient_attributes>
<lrm id="test-vm-1">
<lrm_resources>
<lrm_resource id="drbd_r0:0" type="drbd" class="ocf"
provider="heartbeat">
<lrm_rsc_op id="drbd_r0:0_last_failure_0"
operation_key="drbd_r0:0_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:8;7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="32" rc-code="8" op-status="0" interval="0"
op-digest="c0e018b73fdf522b6cdd355e125af15e"/>
<lrm_rsc_op id="drbd_r0:0_monitor_59000"
operation_key="drbd_r0:0_monitor_59000" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:8;20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="35" rc-code="8" op-status="0" interval="59000"
op-digest="6f5adcd7f1211cdfc17850827b8582c5"/>
</lrm_resource>
<lrm_resource id="nfs" type="nfs-kernel-server" class="lsb">
<lrm_rsc_op id="nfs_last_0" operation_key="nfs_start_0"
operation="start" crm-debug-origin="build_active_RAs"
crm_feature_set="3.0.6"
transition-key="14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="39" rc-code="0" op-status="0" interval="0"
op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
<lrm_rsc_op id="nfs_last_failure_0"
operation_key="nfs_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="31" rc-code="0" op-status="0" interval="0"
op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
<lrm_rsc_op id="nfs_monitor_5000"
operation_key="nfs_monitor_5000" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="40" rc-code="0" op-status="0" interval="5000"
op-digest="4811cef7f7f94e3a35a70be7916cb2fd"/>
</lrm_resource>
<lrm_resource id="nfs_ip" type="IPaddr2" class="ocf"
provider="heartbeat">
<lrm_rsc_op id="nfs_ip_last_failure_0"
operation_key="nfs_ip_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="30" rc-code="0" op-status="0" interval="0"
op-digest="570cd25774b1ead32cb1840813adbe21"/>
<lrm_rsc_op id="nfs_ip_monitor_10000"
operation_key="nfs_ip_monitor_10000" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="33" rc-code="0" op-status="0" interval="10000"
op-digest="bc929bfa78c3086ebd199cf0110b87bf"/>
</lrm_resource>
<lrm_resource id="nfs_fs" type="Filesystem" class="ocf"
provider="heartbeat">
<lrm_rsc_op id="nfs_fs_last_failure_0"
operation_key="nfs_fs_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="29" rc-code="0" op-status="0" interval="0"
op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/>
</lrm_resource>
</lrm_resources>
</lrm>
</node_state>
<node_state id="test-vm-2" uname="test-vm-2" ha="active" in_ccm="true"
crmd="online" join="member" crm-debug-origin="do_update_resource"
expected="member" shutdown="0">
<lrm id="test-vm-2">
<lrm_resources>
<lrm_resource id="nfs" type="nfs-kernel-server" class="lsb">
<lrm_rsc_op id="nfs_last_0" operation_key="nfs_monitor_0"
operation="monitor" crm-debug-origin="do_update_resource"
crm_feature_set="3.0.6"
transition-key="10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:7;10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="4" rc-code="7" op-status="0" interval="0" last-run="1380832563"
last-rc-change="1380832563" exec-time="210" queue-time="0"
op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
</lrm_resource>
<lrm_resource id="nfs_ip" type="IPaddr2" class="ocf"
provider="heartbeat">
<lrm_rsc_op id="nfs_ip_last_0" operation_key="nfs_ip_monitor_0"
operation="monitor" crm-debug-origin="do_update_resource"
crm_feature_set="3.0.6"
transition-key="9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:7;9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="3" rc-code="7" op-status="0" interval="0" last-run="1380832563"
last-rc-change="1380832563" exec-time="490" queue-time="0"
op-digest="570cd25774b1ead32cb1840813adbe21"/>
</lrm_resource>
<lrm_resource id="nfs_fs" type="Filesystem" class="ocf"
provider="heartbeat">
<lrm_rsc_op id="nfs_fs_last_0" operation_key="nfs_fs_monitor_0"
operation="monitor" crm-debug-origin="do_update_resource"
crm_feature_set="3.0.6"
transition-key="8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:7;8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="2" rc-code="7" op-status="0" interval="0" last-run="1380832563"
last-rc-change="1380832563" exec-time="690" queue-time="0"
op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/>
</lrm_resource>
<lrm_resource id="drbd_r0:1" type="drbd" class="ocf"
provider="heartbeat">
<lrm_rsc_op id="drbd_r0:1_last_0"
operation_key="drbd_r0:1_start_0" operation="start"
crm-debug-origin="do_update_resource" crm_feature_set="3.0.6"
transition-key="26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="6" rc-code="0" op-status="0" interval="0" last-run="1380832564"
last-rc-change="1380832564" exec-time="840" queue-time="0"
op-digest="c0e018b73fdf522b6cdd355e125af15e"/>
<lrm_rsc_op id="drbd_r0:1_monitor_60000"
operation_key="drbd_r0:1_monitor_60000" operation="monitor"
crm-debug-origin="do_update_resource" crm_feature_set="3.0.6"
transition-key="25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="8" rc-code="0" op-status="0" interval="60000"
last-rc-change="1380832565" exec-time="310" queue-time="10"
op-digest="6f5adcd7f1211cdfc17850827b8582c5"/>
</lrm_resource>
</lrm_resources>
</lrm>
<transient_attributes id="test-vm-2">
<instance_attributes id="status-test-vm-2">
<nvpair id="status-test-vm-2-probe_complete"
name="probe_complete" value="true"/>
<nvpair id="status-test-vm-2-master-drbd_r0.1"
name="master-drbd_r0:1" value="75"/>
</instance_attributes>
</transient_attributes>
</node_state>
</status>
</cib>
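
In case it's easier to read, the two constraints in the <constraints> section
translate to roughly the following in crm shell syntax (just a hand-written
rendering of the same rules, not a separate config):

    colocation drbd-nfs-ha inf: ms-drbd_r0:Master nfs_resources
    order drbd-before-nfs inf: ms-drbd_r0:promote nfs_resources:start
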
On Thu, Oct 3, 2013 at 5:06 PM, Andreas Kurz <andreas at hastexo.com> wrote:
> On 2013-10-03 22:12, David Parker wrote:
> > Thanks, Andrew. The goal was to use either Pacemaker and Corosync 1.x
> > from the Debian packages, or use both compiled from source. So, with
> > the compiled version, I was hoping to avoid CMAN. However, it seems the
> > packaged version of Pacemaker doesn't support CMAN anyway, so it's moot.
> >
> > I rebuilt my VMs from scratch, re-installed Pacemaker and Corosync from
> > the Debian packages, but I'm still having an odd problem. Here is the
> > config portion of my CIB:
> >
> > <crm_config>
> > <cluster_property_set id="cib-bootstrap-options">
> > <nvpair id="cib-bootstrap-options-dc-version" name="dc-version"
> > value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/>
> > <nvpair id="cib-bootstrap-options-cluster-infrastructure"
> > name="cluster-infrastructure" value="openais"/>
> > <nvpair id="cib-bootstrap-options-expected-quorum-votes"
> > name="expected-quorum-votes" value="2"/>
> > <nvpair id="cib-bootstrap-options-stonith-enabled"
> > name="stonith-enabled" value="false"/>
> > <nvpair id="cib-bootstrap-options-no-quorum-policy"
> > name="no-quorum-policy" value="ignore"/>
> > </cluster_property_set>
> > </crm_config>
> >
> > I set no-quorum-policy=ignore based on the documentation example for a
> > 2-node cluster. But when Pacemaker starts up on the first node, the
> > DRBD resource is in slave mode and none of the other resources are
> > started (they depend on DRBD being master), and I see these lines in the
> > log:
> >
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: unpack_config: On
> > loss of CCM Quorum: Ignore
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs_fs (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs_ip (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > drbd_r0:0 (test-vm-1)
> >
> > I'm assuming the NFS resources show "blocked" because the resource they
> > depend on is not in the correct state.
> >
> > Even when the second node (test-vm-2) comes online, the state of these
> > resources does not change. I can shutdown and re-start Pacemaker over
> > and over again on test-vm-2, but nothing changes. However... and this
> > is where it gets weird... if I shut down Pacemaker on test-vm-1, then
> > all of the resources immediately fail over to test-vm-2 and start
> > correctly. And I see these lines in the log:
> >
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: unpack_config: On
> > loss of CCM Quorum: Ignore
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: stage6: Scheduling
> > Node test-vm-1 for shutdown
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs_fs (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs_ip (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Stop
> > drbd_r0:0 (test-vm-1)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Promote
> > drbd_r0:1 (Slave -> Master test-vm-2)
> >
> > After that, I can generally move the resources back and forth, and even
> > fail them over by hard-failing a node, without any problems. The real
> > problem is that this isn't consistent, though. Every once in a while,
> > I'll hard-fail a node and the other one will go into this "stuck" state
> > where Pacemaker knows it lost a node, but DRBD will stay in slave mode
> > and the other resources will never start. It seems to happen quite
> > randomly. Then, even if I restart Pacemaker on both nodes, or reboot
> > them altogether, I run into the startup issue mentioned previously.
> >
> > Any ideas?
>
> Yes, share your complete resource configuration ;-)
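> (e.g. a full "cibadmin -Q" dump or just "crm configure show" would do)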
>
> Regards,
> Andreas
>
> >
> > Thanks,
> > Dave
> >
> >
> >
> > On Wed, Oct 2, 2013 at 1:01 AM, Andrew Beekhof <andrew at beekhof.net> wrote:
> >
> >
> > On 02/10/2013, at 5:24 AM, David Parker <dparker at utica.edu> wrote:
> >
> > > Thanks, I did a little Googling and found the git repository for
> pcs.
> >
> > pcs won't help you rebuild pacemaker with cman support (or corosync
> > 2.x support) turned on though.
> >
> >
> > > Is there any way to make a two-node cluster work with the stock
> > Debian packages, though? It seems odd that this would be impossible.
> >
> > it really depends how the Debian maintainers built pacemaker.
> > by the sounds of it, it only supports the pacemaker plugin mode for
> > corosync 1.x
> >
> > >
> > >
> > > On Tue, Oct 1, 2013 at 3:16 PM, Larry Brigman <larry.brigman at gmail.com> wrote:
> > > pcs is another package you will need to install.
> > >
> > > On Oct 1, 2013 9:04 AM, "David Parker" <dparker at utica.edu> wrote:
> > > Hello,
> > >
> > > Sorry for the delay in my reply. I've been doing a lot of
> > experimentation, but so far I've had no luck.
> > >
> > > Thanks for the suggestion, but it seems I'm not able to use CMAN.
> > I'm running Debian Wheezy with Corosync and Pacemaker installed via
> > apt-get. When I installed CMAN and set up a cluster.conf file,
> > Pacemaker refused to start and said that CMAN was not supported.
> > When CMAN is not installed, Pacemaker starts up fine, but I see
> > these lines in the log:
> > >
> > > Sep 30 23:36:29 test-vm-1 crmd: [6941]: ERROR:
> > init_quorum_connection: The Corosync quorum API is not supported in
> > this build
> > > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: ERROR:
> > pcmk_child_exit: Child process crmd exited (pid=6941, rc=100)
> > > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: WARN:
> > pcmk_child_exit: Pacemaker child process crmd no longer wishes to be
> > respawned. Shutting ourselves down.
> > >
> > > So, then I checked to see which plugins are supported:
> > >
> > > # pacemakerd -F
> > > Pacemaker 1.1.7 (Build: ee0730e13d124c3d58f00016c3376a1de5323cff)
> > > Supporting: generated-manpages agent-manpages ncurses heartbeat
> > corosync-plugin snmp libesmtp
> > >
> > > Am I correct in believing that this Pacemaker package has been
> > compiled without support for any quorum API? If so, does anyone
> > know if there is a Debian package which has the correct support?
> > >
> > > I also tried compiling LibQB, Corosync and Pacemaker from source
> > via git, following the instructions documented here:
> > >
> > > http://clusterlabs.org/wiki/SourceInstall
> > >
> > > I was hopeful that this would work, because as I understand it,
> > Corosync 2.x no longer uses CMAN. Everything compiled and started
> > fine, but the compiled version of Pacemaker did not include either
> > the 'crm' or 'pcs' commands. Do I need to install something else in
> > order to get one of these?
> > >
> > > Any and all help is greatly appreciated!
> > >
> > > Thanks,
> > > Dave
> > >
> > >
> > > On Wed, Sep 25, 2013 at 6:08 AM, David Lang <david at lang.hm> wrote:
> > > the cluster is trying to reach a quorum (the majority of the nodes
> > talking to each other) and that is never going to happen with only
> > one node. So you have to disable this.
> > >
> > > try putting
> > > <cman two_node="1" expected_votes="1" transport="udpu"/>
> > > in your cluster.conf
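> > > e.g. a minimal cluster.conf sketch along these lines (the cluster name is
> > > just a placeholder, the node names are yours):
> > >
> > > <?xml version="1.0"?>
> > > <cluster name="testcluster" config_version="1">
> > >   <cman two_node="1" expected_votes="1" transport="udpu"/>
> > >   <clusternodes>
> > >     <clusternode name="test-vm-1" nodeid="1"/>
> > >     <clusternode name="test-vm-2" nodeid="2"/>
> > >   </clusternodes>
> > > </cluster>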
> > >
> > > David Lang
> > >
> > > On Tue, 24 Sep 2013, David Parker wrote:
> > >
> > > Date: Tue, 24 Sep 2013 11:48:59 -0400
> > > From: David Parker <dparker at utica.edu>
> > > Reply-To: The Pacemaker cluster resource manager <pacemaker at oss.clusterlabs.org>
> > > To: The Pacemaker cluster resource manager <pacemaker at oss.clusterlabs.org>
> > > Subject: Re: [Pacemaker] Corosync won't recover when a node fails
> > >
> > >
> > > I forgot to mention, OS is Debian Wheezy 64-bit, Corosync and
> > Pacemaker
> > > installed from packages via apt-get, and there are no local
> > firewall rules
> > > in place:
> > >
> > > # iptables -L
> > > Chain INPUT (policy ACCEPT)
> > > target prot opt source destination
> > >
> > > Chain FORWARD (policy ACCEPT)
> > > target prot opt source destination
> > >
> > > Chain OUTPUT (policy ACCEPT)
> > > target prot opt source destination
> > >
> > >
> > > On Tue, Sep 24, 2013 at 11:41 AM, David Parker <dparker at utica.edu> wrote:
> > >
> > > Hello,
> > >
> > > I have a 2-node cluster using Corosync and Pacemaker, where the
> > nodes are
> > > actually two VirtualBox VMs on the same physical machine. I have
> some
> > > resources set up in Pacemaker, and everything works fine if I move
> > them in
> > > a controlled way with the "crm_resource -r <resource> --move
> > --node <node>"
> > > command.
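> > > For example, with the names from my config (the move adds a location
> > > constraint, which I clear again afterwards; I believe the un-move flag on
> > > this version is -U):
> > >
> > > crm_resource -r nfs_resources --move --node test-vm-2
> > > crm_resource -r nfs_resources -U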
> > >
> > > However, when I hard-fail one of the nodes via the "poweroff"
> > command in
> > > VirtualBox, which "pulls the plug" on the VM, the resources do
> > not move,
> > > and I see the following output in the log on the remaining node:
> > >
> > > Sep 24 11:20:30 corosync [TOTEM ] The token was lost in the
> > OPERATIONAL
> > > state.
> > > Sep 24 11:20:30 corosync [TOTEM ] A processor failed, forming new
> > > configuration.
> > > Sep 24 11:20:30 corosync [TOTEM ] entering GATHER state from 2.
> > > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: debug: rsc:drbd_r0:0
> > monitor[31]
> > > (pid 8495)
> > > drbd[8495]: 2013/09/24_11:20:31 WARNING: This resource agent is
> > > deprecated and may be removed in a future release. See the man
> > page for
> > > details. To suppress this warning, set the "ignore_deprecation"
> > resource
> > > parameter to true.
> > > drbd[8495]: 2013/09/24_11:20:31 WARNING: This resource agent is
> > > deprecated and may be removed in a future release. See the man
> > page for
> > > details. To suppress this warning, set the "ignore_deprecation"
> > resource
> > > parameter to true.
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Calling drbdadm -c
> > > /etc/drbd.conf role r0
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Exit code 0
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Command output:
> > > Secondary/Primary
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Calling drbdadm -c
> > > /etc/drbd.conf cstate r0
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Exit code 0
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Command output:
> > Connected
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0 status:
> > Secondary/Primary
> > > Secondary Primary Connected
> > > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: info: operation
> monitor[31] on
> > > drbd_r0:0 for client 2506: pid 8495 exited with return code 0
> > > Sep 24 11:20:32 corosync [TOTEM ] entering GATHER state from 0.
> > > Sep 24 11:20:34 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:34 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:36 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:36 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:38 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:38 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:40 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:40 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:40 corosync [TOTEM ] Totem is unable to form a cluster
> > > because of an operating system or network fault. The most common
> > cause of
> > > this message is that the local firewall is configured improperly.
> > > Sep 24 11:20:43 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:43 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:43 corosync [TOTEM ] Totem is unable to form a cluster
> > > because of an operating system or network fault. The most common
> > cause of
> > > this message is that the local firewall is configured improperly.
> > > Sep 24 11:20:45 corosync [TOTEM ] The consensus timeout expired.
> > > Sep 24 11:20:45 corosync [TOTEM ] entering GATHER state from 3.
> > > Sep 24 11:20:45 corosync [TOTEM ] Totem is unable to form a cluster
> > > because of an operating system or network fault. The most common
> > cause of
> > > this message is that the local firewall is configured improperly.
> > > Sep 24 11:20:47 corosync [TOTEM ] The consensus timeout expired.
> > >
> > > Those last 3 messages just repeat over and over, the cluster never
> > > recovers, and the resources never move. "crm_mon" reports that the
> > > resources are still running on the dead node, and shows no
> > indication that
> > > anything has gone wrong.
> > >
> > > Does anyone know what the issue could be? My expectation was that
> the
> > > remaining node would become the sole member of the cluster, take
> > over the
> > > resources, and everything would keep running.
> > >
> > > For reference, my corosync.conf file is below:
> > >
> > > compatibility: whitetank
> > >
> > > totem {
> > > version: 2
> > > secauth: off
> > > interface {
> > > member {
> > > memberaddr: 192.168.25.201
> > > }
> > > member {
> > > memberaddr: 192.168.25.202
> > > }
> > > ringnumber: 0
> > > bindnetaddr: 192.168.25.0
> > > mcastport: 5405
> > > }
> > > transport: udpu
> > > }
> > >
> > > logging {
> > > fileline: off
> > > to_logfile: yes
> > > to_syslog: yes
> > > debug: on
> > > logfile: /var/log/cluster/corosync.log
> > > timestamp: on
> > > logger_subsys {
> > > subsys: AMF
> > > debug: on
> > > }
> > > }
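> > > (Not shown above: the pacemaker service stanza for plugin mode, which I
> > > believe is in a separate file under /etc/corosync/service.d/ on these
> > > nodes rather than in corosync.conf itself. Roughly:
> > >
> > > service {
> > >     name: pacemaker
> > >     ver: 1
> > > }
> > >
> > > where ver: 1 means Pacemaker is started by its own init script, and
> > > ver: 0 means the plugin starts it.)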
> > >
> > >
> > > Thanks!
> > > Dave
> > >
> > > --
> > > Dave Parker
> > > Systems Administrator
> > > Utica College
> > > Integrated Information Technology Services
> > > (315) 792-3229
> > > Registered Linux User #408177
> > >
> > >
> > >
> > >
> > >
> > > --
> > > Dave Parker
> > > Systems Administrator
> > > Utica College
> > > Integrated Information Technology Services
> > > (315) 792-3229
> > > Registered Linux User #408177
> > >
> > >
> > >
> > >
> > >
> > > --
> > > Dave Parker
> > > Systems Administrator
> > > Utica College
> > > Integrated Information Technology Services
> > > (315) 792-3229
> > > Registered Linux User #408177
> >
> >
> >
> >
> > --
> > Dave Parker
> > Systems Administrator
> > Utica College
> > Integrated Information Technology Services
> > (315) 792-3229
> > Registered Linux User #408177
> >
> >
> >
>
>
> --
> Need help with Pacemaker?
> http://www.hastexo.com/now
>
>
>
> _______________________________________________
> Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
>
> Project Home: http://www.clusterlabs.org
> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> Bugs: http://bugs.clusterlabs.org
>
>
--
Dave Parker
Systems Administrator
Utica College
Integrated Information Technology Services
(315) 792-3229
Registered Linux User #408177