[Pacemaker] Why monitor fails in my RA

Wed Apr 25 20:41:05 UTC 2012

Hi,

I am trying to write a Redis resource agent that works in master-slave 
mode. My configuration:
node s1
node s2
primitive ip-redis ocf:heartbeat:IPaddr2 \
         params ip="192.168.1.15" nic="eth0" cidr_netmask="24" \
         op monitor interval="10s" timeout="30s" \
         meta target-role="Started"
primitive redis-server ocf:implix:redis4 \
         op start interval="0" timeout="60s" \
         op stop interval="0" timeout="60s" \
         op monitor interval="5s" role="Master" timeout="60s" \
         op monitor interval="10s" role="Slave" timeout="60s" \
         params masterip="192.168.1.15"
ms redis-ms redis-server \
         meta master-max="1" master-node-max="1" clone-max="2" \
	clone-node-max="1" target-role="Master"
colocation co-redis-ms inf: ip-redis redis-ms:Master
order or-redis inf: redis-ms:promote ip-redis:start
property $id="cib-bootstrap-options" \
         dc-version="1.1.6-9971ebba4494012a93c03b40a2c58ec0eb60f50c" \
         cluster-infrastructure="openais" \
         no-quorum-policy="ignore" \
         stonith-enabled="false" \
         expected-quorum-votes="2" \
         default-action-timeout="20s" \
         last-lrm-refresh="1335271825" \
         default-resource-stickiness="10"

	To simplify the RA, all Redis nodes start as slaves (that's why I need 
to pass masterip in the configuration).

	The script works well: it promotes the secondary when the master node 
goes down, but only a few times. At some point, sometimes after two or 
three master failures (I kill the process manually), I get this error:
redis-server:0_monitor_5000 (node=s1, call=16, rc=9, status=complete): 
master (failed)

My monitor function (simplified, with overhead removed and some comments 
added) is:
# redis_monitor: OCF monitor action for the master/slave redis resource.
#
# Queries the local redis role through redis_state (which leaves a
# "key:value" line in $RET), refreshes this node's promotion score via
# $CRM_MASTER and terminates the agent with the matching OCF code:
#   $OCF_RUNNING_MASTER - redis reports role "master"
#   $OCF_SUCCESS        - redis reports role "slave"
#   $OCF_FAILED_MASTER  - no usable role and this node held the master score
#   $OCF_NOT_RUNNING    - no usable role otherwise
# Every path ends in exit, so the function never returns to its caller.
redis_monitor() {
        local state

        # Promotion score recorded before this run (10 for master, 5 for
        # slave). It must be read first because the failure path deletes it.
        CURSCORE=$($CRM_MASTER -G -q)
        logger "redis_monitor: score $CURSCORE"

        # redis_state stores the current local redis state in RET.
        redis_state

        # Keep the value after the first ':' and drop the trailing CR.
        state=$(echo "${RET}" | cut -d':' -f2 | tr -d '\r')

        if [ "${state}" = "master" ]; then
                $CRM_MASTER -v $CRM_MASTER_SCORE # score is 10
                exit $OCF_RUNNING_MASTER
        fi

        if [ "${state}" = "slave" ]; then
                $CRM_MASTER -v $CRM_SLAVE_SCORE # score is 5
                exit $OCF_SUCCESS
        fi

        # Neither master nor slave: the resource has failed, so drop the score.
        $CRM_MASTER -l reboot -D

        # CURSCORE is empty when no score was set. Quote both operands and
        # default to 0 so the comparison is well-formed instead of expanding
        # to "[ -eq 10 ]"; stderr is silenced in case the query printed
        # something non-numeric.
        if [ "${CURSCORE:-0}" -eq "${CRM_MASTER_SCORE}" ] 2>/dev/null; then
                exit $OCF_FAILED_MASTER
        fi

        exit $OCF_NOT_RUNNING
}

From my logs I know that the monitor function returned OCF_FAILED_MASTER 
when the master was down, and then this error occurred:
redis-server:0_monitor_5000 (node=s1, call=16, rc=9, status=complete): 
master (failed)

After that, the failed master is no longer monitored on that node until 
I run a cleanup:
#crm resource cleanup redis-server:0

My questions:
1) What am I doing wrong? How can I fix this?
I've tried on-fail="restart", but this did not help.

2) With an older version of Redis (2.3), if the master fails, Redis hangs 
for some time (21-24 seconds). Even though I set a higher timeout on the 
monitor operations, they still time out after 20 seconds. Why?
(Changing default-action-timeout to a higher value resolved this, but I 
think the per-operation timeout should be enough.)

--
Greg