[Pacemaker] Multiple split-brain problem

coma coma.inf at gmail.com
Tue Jun 26 15:49:32 CEST 2012


Hello,

I'm running a 2-node cluster with Corosync & DRBD in active/passive mode
for MySQL high availability.

The cluster is working fine (failover/failback & replication OK) and I have
no network outages (the network is monitored and I haven't seen any
failures), but split-brain occurs very often and I don't understand why.
Maybe you can help me?

I'm a new Pacemaker/Corosync/DRBD user, so my cluster and DRBD
configurations are probably not optimal; if you have any comments, tips, or
examples I would be very grateful!

Here is an example of the Corosync log when a split-brain occurs (1 hour of
log, to show before/after the split-brain):

http://pastebin.com/3DprkcTA

Thank you in advance for any help!


More details about my configuration:

I have:
One preferred "master" node (node1) on a virtual server, and one "slave"
node (node2) on a physical server.
On each server:
eth0 is connected to my main LAN for client/server communication (and
carries the cluster VIP)
eth1 is connected to a dedicated VLAN for Corosync communication (network:
192.168.3.0/30)
eth2 is connected to a dedicated VLAN for DRBD replication (network:
192.168.2.0/30)
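
For context, the Corosync ring on eth1 is defined by a totem/interface
section roughly like the sketch below (the secauth, mcastaddr and mcastport
values here are just placeholders, not necessarily what I actually run):

totem {
        version: 2
        secauth: off
        interface {
                # ring 0 bound to the dedicated eth1 VLAN
                ringnumber: 0
                bindnetaddr: 192.168.3.0
                mcastaddr: 226.94.1.1
                mcastport: 5405
        }
}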

Here is my DRBD configuration:


resource drbd-mysql {
    protocol C;
    disk {
        on-io-error detach;
    }
    handlers {
        # fence/unfence the peer via Pacemaker constraints around resync
        fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
        after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
        # mail root when DRBD detects a split-brain
        split-brain "/usr/lib/drbd/notify-split-brain.sh root";
    }
    net {
        cram-hmac-alg sha1;
        shared-secret "secret";
        # automatic split-brain recovery policies
        after-sb-0pri discard-younger-primary;
        after-sb-1pri discard-secondary;
        after-sb-2pri call-pri-lost-after-sb;
    }
    startup {
        wfc-timeout 1;
        degr-wfc-timeout 1;
    }
    on node1 {
        device /dev/drbd1;
        address 192.168.2.1:7801;
        disk /dev/sdb;
        meta-disk internal;
    }
    on node2 {
        device /dev/drbd1;
        address 192.168.2.2:7801;
        disk /dev/sdb;
        meta-disk internal;
    }
}


Here is my cluster config:

node node1 \
        attributes standby="off"
node node2 \
        attributes standby="off"
primitive Cluster-VIP ocf:heartbeat:IPaddr2 \
        params ip="10.1.0.130" broadcast="10.1.7.255" nic="eth0" \
        cidr_netmask="21" iflabel="VIP1" \
        op monitor interval="10s" timeout="20s" \
        meta is-managed="true"
primitive cluster_status_page ocf:heartbeat:ClusterMon \
        params pidfile="/var/run/crm_mon.pid" \
        htmlfile="/var/www/html/cluster_status.html" \
        op monitor interval="4s" timeout="20s"
primitive datavg ocf:heartbeat:LVM \
        params volgrpname="datavg" exclusive="true" \
        op start interval="0" timeout="30" \
        op stop interval="0" timeout="30"
primitive drbd_mysql ocf:linbit:drbd \
        params drbd_resource="drbd-mysql" \
        op monitor interval="15s"
primitive fs_mysql ocf:heartbeat:Filesystem \
        params device="/dev/datavg/data" directory="/data" fstype="ext4"
primitive mail_alert ocf:heartbeat:MailTo \
        params email="myemail at test.com" \
        op monitor interval="10" timeout="10" depth="0"
primitive mysqld ocf:heartbeat:mysql \
        params binary="/usr/bin/mysqld_safe" config="/etc/my.cnf" \
        datadir="/data/mysql/databases" user="mysql" \
        pid="/var/run/mysqld/mysqld.pid" socket="/var/lib/mysql/mysql.sock" \
        test_passwd="cluster_test" test_table="Cluster_Test.dbcheck" \
        test_user="cluster_test" \
        op start interval="0" timeout="120" \
        op stop interval="0" timeout="120" \
        op monitor interval="30s" timeout="30s" OCF_CHECK_LEVEL="1" \
        target-role="Started"
group mysql datavg fs_mysql Cluster-VIP mysqld cluster_status_page \
        mail_alert
ms ms_drbd_mysql drbd_mysql \
        meta master-max="1" master-node-max="1" clone-max="2" \
        clone-node-max="1" notify="true"
location mysql-preferred-node mysql inf: node1
colocation mysql_on_drbd inf: mysql ms_drbd_mysql:Master
order mysql_after_drbd inf: ms_drbd_mysql:promote mysql:start
property $id="cib-bootstrap-options" \
        dc-version="1.1.6-3.el6-a02c0f19a00c1eb2527ad38f146ebc0834814558" \
        cluster-infrastructure="openais" \
        expected-quorum-votes="2" \
        stonith-enabled="false" \
        no-quorum-policy="ignore" \
        last-lrm-refresh="1340701656"
rsc_defaults $id="rsc-options" \
        resource-stickiness="100" \
        migration-threshold="2" \
        failure-timeout="30s"