[Pacemaker] how to test network access and fail over accordingly?

Wed Oct 6 03:45:36 UTC 2010

Hello,

I have a 2 node cluster, running DRBD, heartbeat and pacemaker in
active/passive mode.  On both nodes, eth0 is connected to the main
network, eth1 is used to connect the nodes directly to each other.
The nodes share a virtual IP address on eth0.  Pacemaker is also
controlling a custom service with an LSB compliant script in
/etc/init.d/.  All of this is working fine and I'm happy with it.

I'd like to configure the nodes so that they fail over if eth0 goes
down (or if they cannot access a particular gateway), so I tried
adding the following (as per
http://www.clusterlabs.org/wiki/Example_configurations#Set_up_pingd):

primitive p_pingd ocf:pacemaker:pingd params host_list=172.20.0.254 \
        op monitor interval=15s timeout=5s
clone c_pingd p_pingd meta globally-unique=false
location loc_pingd g_cluster_services rule -inf: not_defined p_pingd \
        or p_pingd lte 0

... but when I do add that, all resources are stopped and they don't
come back up on either node.  Am I making a basic mistake, or do you
need more info from me?

All help is appreciated,
Craig.

pacemaker
Version: 1.0.8+hg15494-2ubuntu2

heartbeat
Version: 1:3.0.3-1ubuntu1

drbd8-utils
Version: 2:8.3.7-1ubuntu2.1

rp@rpalpha:~$ sudo crm configure show
node $id="32482293-7b0f-466e-b405-c64bcfa2747d" rpalpha
node $id="3f2aac12-05aa-4ac7-b91f-c47fa28efb44" rpbravo
primitive p_drbd_data ocf:linbit:drbd \
        params drbd_resource="data" \
        op monitor interval="30s"
primitive p_fs_data ocf:heartbeat:Filesystem \
        params device="/dev/drbd/by-res/data" directory="/mnt/data" fstype="ext4"
primitive p_ip ocf:heartbeat:IPaddr2 \
        params ip="172.20.50.3" cidr_netmask="255.255.0.0" nic="eth0" \
        op monitor interval="30s"
primitive p_rp lsb:rp \
        op monitor interval="30s" \
        meta target-role="Started"
group g_cluster_services p_ip p_fs_data p_rp
ms ms_drbd p_drbd_data \
        meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
location loc_preferred_master g_cluster_services inf: rpalpha
colocation colo_mnt_on_master inf: g_cluster_services ms_drbd:Master
order ord_mount_after_drbd inf: ms_drbd:promote g_cluster_services:start
property $id="cib-bootstrap-options" \
        dc-version="1.0.8-042548a451fce8400660f6031f4da6f0223dd5dd" \
        cluster-infrastructure="Heartbeat" \
        no-quorum-policy="ignore" \
        stonith-enabled="false" \
        expected-quorum-votes="2" \

rp@rpalpha:~$ sudo cat /etc/ha.d/ha.cf
node rpalpha
node rpbravo

keepalive 2
warntime 5
deadtime 15
initdead 60

mcast eth0 239.0.0.43 694 1 0
bcast eth1

use_logd yes
autojoin none
crm respawn

rp@rpalpha:~$ sudo cat /etc/drbd.conf
global {
        usage-count no;
}
common {
        protocol C;

        handlers {}

        startup {}

        disk {}

        net {
                cram-hmac-alg sha1;
                shared-secret "foobar";
        }

        syncer {
                verify-alg sha1;
                rate 100M;
        }
}
resource data {
        device /dev/drbd0;
        meta-disk internal;
        on rpalpha {
                disk /dev/mapper/rpalpha-data;
                address 192.168.1.1:7789;
        }
        on rpbravo {
                disk /dev/mapper/rpbravo-data;
                address 192.168.1.2:7789;
        }
}