[Pacemaker] Multiple split-brain problem
coma
coma.inf at gmail.com
Tue Jun 26 15:49:32 CEST 2012
Hello,
I am running a 2-node cluster with Corosync & DRBD in active/passive mode
for MySQL high availability.
The cluster is working fine (failover/failback & replication OK), and I have no
network outage (the network is monitored and I've not seen any failure), but
split-brain occurs very often and I don't understand why — maybe you can
help me?
I'm a new Pacemaker/Corosync/DRBD user, so my cluster and DRBD configuration
are probably not optimal, so if you have any comments, tips, or examples I
would be very grateful!
Here is an example of the Corosync log when a split-brain occurs (1 hour of log to
see before/after the split-brain):
http://pastebin.com/3DprkcTA
Thank you in advance for any help!
More details about my configuration:
I have:
One preferred "master" node (node1) on a virtual server, and one "slave"
node on a physical server.
On each server,
eth0 is connected on my main LAN for client/server communication (with
cluster VIP)
eth1 is connected on a dedicated VLAN for Corosync communication (network:
192.168.3.0/30)
eth2 is connected on a dedicated VLAN for DRBD replication (network:
192.168.2.0/30)
Here is my drbd configuration:
resource drbd-mysql {
protocol C;
disk {
on-io-error detach;
}
handlers {
fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
split-brain "/usr/lib/drbd/notify-split-brain.sh root";
}
net {
cram-hmac-alg sha1;
shared-secret "secret";
after-sb-0pri discard-younger-primary;
after-sb-1pri discard-secondary;
after-sb-2pri call-pri-lost-after-sb;
}
startup {
wfc-timeout 1;
degr-wfc-timeout 1;
}
on node1{
device /dev/drbd1;
address 192.168.2.1:7801;
disk /dev/sdb;
meta-disk internal;
}
on node2 {
device /dev/drbd1;
address 192.168.2.2:7801;
disk /dev/sdb;
meta-disk internal;
}
}
Here my cluster config:
node node1 \
attributes standby="off"
node node2 \
attributes standby="off"
primitive Cluster-VIP ocf:heartbeat:IPaddr2 \
params ip="10.1.0.130" broadcast="10.1.7.255" nic="eth0"
cidr_netmask="21" iflabel="VIP1" \
op monitor interval="10s" timeout="20s" \
meta is-managed="true"
primitive cluster_status_page ocf:heartbeat:ClusterMon \
params pidfile="/var/run/crm_mon.pid"
htmlfile="/var/www/html/cluster_status.html" \
op monitor interval="4s" timeout="20s"
primitive datavg ocf:heartbeat:LVM \
params volgrpname="datavg" exclusive="true" \
op start interval="0" timeout="30" \
op stop interval="0" timeout="30"
primitive drbd_mysql ocf:linbit:drbd \
params drbd_resource="drbd-mysql" \
op monitor interval="15s"
primitive fs_mysql ocf:heartbeat:Filesystem \
params device="/dev/datavg/data" directory="/data" fstype="ext4"
primitive mail_alert ocf:heartbeat:MailTo \
params email="myemail at test.com" \
op monitor interval="10" timeout="10" depth="0"
primitive mysqld ocf:heartbeat:mysql \
params binary="/usr/bin/mysqld_safe" config="/etc/my.cnf"
datadir="/data/mysql/databases" user="mysql"
pid="/var/run/mysqld/mysqld.pid" socket="/var/lib/mysql/mysql.sock"
test_passwd="cluster_test" test_table="Cluster_Test.dbcheck"
test_user="cluster_test" \
op start interval="0" timeout="120" \
op stop interval="0" timeout="120" \
op monitor interval="30s" timeout="30s" OCF_CHECK_LEVEL="1"
target-role="Started"
group mysql datavg fs_mysql Cluster-VIP mysqld cluster_status_page
mail_alert
ms ms_drbd_mysql drbd_mysql \
meta master-max="1" master-node-max="1" clone-max="2"
clone-node-max="1" notify="true"
location mysql-preferred-node mysql inf: node1
colocation mysql_on_drbd inf: mysql ms_drbd_mysql:Master
order mysql_after_drbd inf: ms_drbd_mysql:promote mysql:start
property $id="cib-bootstrap-options" \
dc-version="1.1.6-3.el6-a02c0f19a00c1eb2527ad38f146ebc0834814558" \
cluster-infrastructure="openais" \
expected-quorum-votes="2" \
stonith-enabled="false" \
no-quorum-policy="ignore" \
last-lrm-refresh="1340701656"
rsc_defaults $id="rsc-options" \
resource-stickiness="100" \
migration-threshold="2" \
failure-timeout="30s"
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://oss.clusterlabs.org/pipermail/pacemaker/attachments/20120626/4869a2e6/attachment-0001.html>
More information about the Pacemaker
mailing list