[Pacemaker] DRBD Split Brain after each reboot
andschais at gmail.com
Mon Dec 21 20:22:44 UTC 2009
Hi all,
I'm having trouble with a two-node Pacemaker+DRBD cluster. I have been trying to
solve this for about a week and I really need help!
If I pull the power cord, failover works great: the resources migrate to the
secondary node and move back to the primary when I power it on again.
But when I reboot the primary node with "shutdown -r now", I always end up with
a split brain. That's not all: if I configure only a few resources (for example
virtual IP, DRBD, Apache and PostgreSQL), the split brain does not occur, but as
soon as I add 8 or 9 resources (especially when one of them is JBoss AS), I
always get a split brain.
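So far I have been recovering by hand with the usual DRBD split-brain procedure,
roughly like this (assuming r0 is the affected resource and that the node that
rebooted is the one whose changes should be thrown away):

# on the node whose data should be discarded (the split-brain "victim")
drbdadm secondary r0
drbdadm -- --discard-my-data connect r0

# on the surviving node (only needed if it is StandAlone rather than WFConnection)
drbdadm connect r0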
Can someone give me some hints?
My systems are:
OS: Debian Lenny 2.6.26-2-686
Corosync 1.1.2
DRBD 8.3.6
And my configuration files are:
/etc/corosync/corosync.conf
# Please read the openais.conf.5 manual page
totem {
        version: 2

        # How long before declaring a token lost (ms)
        token: 3000

        # How many token retransmits before forming a new configuration
        token_retransmits_before_loss_const: 10

        # How long to wait for join messages in the membership protocol (ms)
        join: 60

        # How long to wait for consensus to be achieved before starting a
        # new round of membership configuration (ms)
        consensus: 1500

        # Turn off the virtual synchrony filter
        vsftype: none

        # Number of messages that may be sent by one processor on receipt of the token
        max_messages: 20

        # Limit generated nodeids to 31-bits (positive signed integers)
        clear_node_high_bit: yes

        # Enable authentication and encryption of cluster messages
        secauth: on

        # How many threads to use for encryption/decryption
        threads: 0

        # Optionally assign a fixed node id (integer)
        # nodeid: 1234

        # This specifies the mode of redundant ring, which may be none, active, or passive.
        rrp_mode: passive

        interface {
                # The following values need to be set based on your environment
                ringnumber: 0
                bindnetaddr: 172.16.1.0
                mcastaddr: 226.94.1.1
                mcastport: 5405
        }
        interface {
                # The following values need to be set based on your environment
                ringnumber: 1
                bindnetaddr: 10.186.68.0
                mcastaddr: 226.94.2.1
                mcastport: 5405
        }
}

amf {
        mode: disabled
}

service {
        # Load the Pacemaker Cluster Resource Manager
        ver: 0
        name: pacemaker
}

aisexec {
        user: root
        group: root
}

logging {
        to_stderr: yes
        debug: on
        timestamp: on
        to_file: yes
        logfile: /var/log/corosync.log
        to_syslog: no
        syslog_facility: daemon
}
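(If it helps, I can also post the redundant-ring status from each node; I would
collect it with something like:)

# show the status of ring 0 and ring 1 on the local node (corosync 1.x)
corosync-cfgtool -s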
/etc/drbd.conf
global {
        usage-count yes;
}
common {
        syncer { rate 33M; }
}
resource r0 {
        protocol C;
        handlers {
                pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
                pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
                local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
                fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
                after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
                outdate-peer "/usr/lib/drbd/outdate-peer.sh";
                split-brain "/usr/lib/drbd/notify-split-brain.sh root@localhost";
        }
        startup {
                degr-wfc-timeout 30;
                wfc-timeout 30;
        }
        disk {
                fencing resource-only;
                on-io-error detach;
        }
        net {
                after-sb-0pri disconnect;
                after-sb-1pri disconnect;
                after-sb-2pri disconnect;
                rr-conflict disconnect;
        }
        on primary {
                device /dev/drbd0;
                disk /dev/vg00/drbd;
                address 172.16.1.1:7788;
                meta-disk internal;
        }
        on secondary {
                device /dev/drbd0;
                disk /dev/vg00/drbd;
                address 172.16.1.2:7788;
                meta-disk internal;
        }
}
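(A side question: would it make sense to let DRBD try to auto-resolve these
split brains instead of just disconnecting? I have been considering something
like the following in the net section, but have not tested it:)

net {
        # untested sketch: automatic split-brain recovery policies
        after-sb-0pri discard-zero-changes;   # neither was primary: keep the data of the node that actually wrote something
        after-sb-1pri discard-secondary;      # one was primary: discard the secondary's changes
        after-sb-2pri disconnect;             # both were primary: still refuse to auto-resolve
}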
And my CRM configuration:
<configuration>
  <crm_config>
    <cluster_property_set id="cib-bootstrap-options">
      <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
      <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="false"/>
      <nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
      <nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1261424411"/>
      <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.0.6-cebe2b6ff49b36b29a3bd7ada1c4701c7470febe"/>
      <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
    </cluster_property_set>
  </crm_config>
  <nodes>
    <node uname="primary" type="normal" id="primary">
      <instance_attributes id="nodes-primary">
        <nvpair name="standby" id="nodes-primary-standby" value="off"/>
      </instance_attributes>
    </node>
    <node uname="secondary" type="normal" id="secondary">
      <instance_attributes id="nodes-secondary">
        <nvpair name="standby" id="nodes-secondary-standby" value="off"/>
      </instance_attributes>
    </node>
  </nodes>
  <resources>
    <master id="ms-drbd">
      <meta_attributes id="ms-drbd-meta_attributes">
        <nvpair id="ms-drbd-meta_attributes-master-max" name="master-max" value="1"/>
        <nvpair id="ms-drbd-meta_attributes-master-node-max" name="master-node-max" value="1"/>
        <nvpair id="ms-drbd-meta_attributes-clone-max" name="clone-max" value="2"/>
        <nvpair id="ms-drbd-meta_attributes-clone-node-max" name="clone-node-max" value="1"/>
        <nvpair id="ms-drbd-meta_attributes-notify" name="notify" value="true"/>
        <nvpair id="ms-drbd-meta_attributes-globally-unique" name="globally-unique" value="false"/>
        <nvpair name="target-role" id="ms-drbd-meta_attributes-target-role" value="Started"/>
      </meta_attributes>
      <primitive class="ocf" id="drbd" provider="linbit" type="drbd">
        <instance_attributes id="drbd-instance_attributes">
          <nvpair id="drbd-instance_attributes-drbd_resource" name="drbd_resource" value="r0"/>
        </instance_attributes>
        <operations>
          <op id="drbd-monitor-59s" interval="59s" name="monitor" role="Master" timeout="30s"/>
          <op id="drbd-monitor-60s" interval="60s" name="monitor" role="Slave" timeout="30s"/>
          <op id="drbd-start-0s" interval="0s" name="start" start-delay="10s"/>
          <op id="drbd-promote-0s" interval="0s" name="promote" start-delay="10s"/>
        </operations>
      </primitive>
    </master>
    <group id="p-group">
      <primitive class="ocf" id="fs" provider="heartbeat" type="Filesystem">
        <instance_attributes id="fs-instance_attributes">
          <nvpair id="fs-instance_attributes-fstype" name="fstype" value="ext3"/>
          <nvpair id="fs-instance_attributes-directory" name="directory" value="/drbd"/>
          <nvpair id="fs-instance_attributes-device" name="device" value="/dev/drbd0"/>
        </instance_attributes>
        <meta_attributes id="fs-meta_attributes">
          <nvpair id="fs-meta_attributes-is-managed" name="is-managed" value="true"/>
        </meta_attributes>
      </primitive>
      <primitive class="ocf" id="ip" provider="heartbeat" type="IPaddr2">
        <instance_attributes id="ip-instance_attributes">
          <nvpair id="ip-instance_attributes-ip" name="ip" value="10.186.68.1"/>
          <nvpair id="ip-instance_attributes-broadcast" name="broadcast" value="10.186.68.127"/>
          <nvpair id="ip-instance_attributes-cidr_netmask" name="cidr_netmask" value="25"/>
        </instance_attributes>
        <operations>
          <op id="ip-monitor-10s" interval="10s" name="monitor"/>
        </operations>
      </primitive>
      <primitive class="heartbeat" id="drbdlinks" type="drbdlinks">
        <operations>
          <op id="drbdlinks-monitor-60s" interval="60s" name="monitor"/>
        </operations>
      </primitive>
      <primitive class="ocf" id="postgresql" provider="heartbeat" type="pgsql">
        <instance_attributes id="postgresql-instance_attributes">
          <nvpair id="postgresql-instance_attributes-pgctl" name="pgctl" value="/usr/lib/postgresql/8.3/bin/pg_ctl"/>
          <nvpair id="postgresql-instance_attributes-psql" name="psql" value="/usr/bin/psql"/>
          <nvpair id="postgresql-instance_attributes-pgdata" name="pgdata" value="/var/lib/postgresql/8.3/main"/>
          <nvpair id="postgresql-instance_attributes-pgdba" name="pgdba" value="postgres"/>
          <nvpair id="postgresql-instance_attributes-pgdb" name="pgdb" value="postgres"/>
          <nvpair id="postgresql-instance_attributes-logfile" name="logfile" value="/var/log/postgresql/postgresql-8.3-main.log"/>
        </instance_attributes>
        <operations>
          <op id="postgresql-monitor-60s" interval="60s" name="monitor" timeout="30s"/>
        </operations>
      </primitive>
      <primitive class="ocf" id="asterisk" provider="custom" type="Asterisk">
        <operations>
          <op id="asterisk-monitor-60s" interval="60s" name="monitor" start-delay="30s" timeout="30s"/>
        </operations>
      </primitive>
      <primitive class="lsb" id="postfix" type="postfix"/>
      <primitive class="ocf" id="apache2" provider="heartbeat" type="apache">
        <instance_attributes id="apache2-instance_attributes">
          <nvpair id="apache2-instance_attributes-configfile" name="configfile" value="/etc/apache2/apache2.conf"/>
        </instance_attributes>
        <operations>
          <op id="apache2-monitor-60s" interval="60s" name="monitor"/>
        </operations>
      </primitive>
      <primitive class="lsb" id="dhcp" type="dhcp3-server"/>
      <primitive class="ocf" id="jboss" provider="custom" type="JBoss">
        <instance_attributes id="jboss-instance_attributes">
          <nvpair id="jboss-instance_attributes-java_home" name="java_home" value="/opt/java/"/>
          <nvpair id="jboss-instance_attributes-jboss_home" name="jboss_home" value="/opt/jboss"/>
        </instance_attributes>
        <operations>
          <op id="jboss-monitor-60s" interval="60s" name="monitor" start-delay="100s" timeout="30s"/>
          <op id="jboss-start-0s" interval="0s" name="start" timeout="99s"/>
        </operations>
      </primitive>
    </group>
  </resources>
  <constraints>
    <rsc_colocation id="p-group-on-ms-drbd" rsc="p-group" score="INFINITY" with-rsc="ms-drbd" with-rsc-role="Master"/>
    <rsc_location id="ms-drbd-master-on-primary" rsc="ms-drbd">
      <rule id="ms-drbd-master-on-primary-rule" role="Master" score="100">
        <expression attribute="#uname" id="ms-drbd-master-on-primary-expression" operation="eq" value="primary"/>
      </rule>
    </rsc_location>
    <rsc_order first="ms-drbd" first-action="promote" id="ms-drbd-before-group" score="INFINITY" then="p-group" then-action="start"/>
  </constraints>
  <rsc_defaults/>
  <op_defaults/>
</configuration>
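(Possibly related: as far as I understand, crm-fence-peer.sh adds a location
constraint to the CIB when it fences the peer. Could a leftover constraint from
a previous fencing be part of the problem after the reboot? I would look for one
roughly like this, although I am not sure of the exact id the script writes:)

# look for a leftover fencing constraint in the CIB
# (the "drbd-fence" id prefix is an assumption; adjust to whatever the script really uses)
cibadmin -Q | grep drbd-fence
crm configure show | grep drbd-fence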
Thanks in advance.
Andres.