[Pacemaker] Consider extra slave node resource when calculating actions for failover
Juraj Fabo
juraj.fabo at gmail.com
Tue Jan 14 12:25:59 UTC 2014
Hi
I have a master-slave cluster with the configuration attached below.
It is based on the documented PostgreSQL master-slave cluster configuration.
The colocation constraints are meant to work so that if any of the "master-group"
resources fails, a failover to the slave node is performed. This basically works fine.
I would like to integrate an additional condition.
The resource SERVICE-res-mon-s1 runs on the HotStandby node, and only there.
If SERVICE-res-mon-s1 on the slave reports a negative score, the failover
should not happen, because that indicates the slave node is not ready to run
the services from master-group.
However, even if SERVICE-res-mon-s1 fails, the PostgreSQL slave (HotStandby)
should keep running, because SERVICE-res-mon-s1 monitors application-related
functionality which does not block PostgreSQL itself.
The requested feature is very close to the one described in
http://clusterlabs.org/doc/en-US/Pacemaker/1.0/html/Pacemaker_Explained/ch09s03s03s02.html
("prefer nodes with the most connectivity"), with the difference that the
resource agent runs only on the standby node.
The reason is that SERVICE-res-mon-s1 is in reality a minimalistic
implementation of SERVICE-service, used to find out whether the slave would be
able to run SERVICE-service.
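Adapted to my case, I imagine something along the lines of the sketch below,
where the Master role of msPostgresql is scored by the attribute maintained by
the monitoring agent (the constraint name and the attribute name span1_score
are only placeholders for whatever the agent actually writes via crm_attribute):

location loc-prefer-span msPostgresql \
rule $role="Master" span1_score: defined span1_score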
If needed, I could change this design and use a clone of SERVICE-res-mon-s1 so
that it runs on both nodes; however, I did not succeed with that configuration
either.
The next step would be to have multiple instances of this resource agent
running on both nodes (each with a different spanID parameter) and to prefer
the node where more spans are OK.
The OCF agent service_res_check reports the resource availability via its
"monitor" function, where it updates its own score attribute via crm_attribute.
I thought that using this custom score attribute in a location or colocation
constraint could do the job, but it did not affect the failover logic.
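What I had in mind is roughly the sketch below: a node attribute written by the
agent vetoes the promotion. As far as I understand, such a rule has to target
the Master role of msPostgresql (or master-group); the constraint name and the
attribute name span1_score are again placeholders for whatever the agent writes:

location loc-no-promote-on-bad-span msPostgresql \
rule $role="Master" -inf: defined span1_score and span1_score lt 0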
Please, what should be done so that the cluster also takes the
SERVICE-res-mon-s1 results into account when calculating resource scores and
the "willingness to move"?
Note: my Pacemaker also contains the patch from
https://github.com/beekhof/pacemaker/commit/58962338
Thank you in advance
node $id="1" serv1 \
attributes SERVICE-pgsql-data-status="STREAMING|ASYNC"
node $id="2" serv2 \
attributes SERVICE-pgsql-data-status="LATEST"
primitive SERVICE-MIP1 ocf:heartbeat:IPaddr2 \
params ip="10.40.0.70" cidr_netmask="24" iflabel="ma1" \
op monitor interval="10s"
primitive SERVICE-MIP2 ocf:heartbeat:IPaddr2 \
params ip="10.40.0.71" cidr_netmask="26" iflabel="ma2" \
op monitor interval="10s"
primitive SERVICE-VIP ocf:heartbeat:IPaddr2 \
params ip="10.40.0.72" cidr_netmask="24" iflabel="sla" \
meta resource-stickiness="1" \
op monitor interval="10s" timeout="60s" on-fail="restart"
primitive SERVICE-res-mon-s1 ocf:heartbeat:service_res_check \
params spanID="1" \
meta resource-stickiness="1" \
op monitor interval="9s" timeout="4s" on-fail="restart"
primitive SERVICE-pgsql ocf:heartbeat:pgsql \
params master_ip="10.40.0.70" slave_ip="10.40.0.72" node_list="serv1 serv2" \
pgctl="/usr/bin/pg_ctl" psql="/usr/bin/psql" pgdata="/var/lib/pgsql/data/" \
start_opt="-p 5432" rep_mode="async" logfile="/var/log/service_ra_pgsql.log" \
primary_conninfo_opt="keepalives_idle=60 keepalives_interval=5 keepalives_count=5" \
stop_escalate="0" \
op start interval="0s" timeout="120s" on-fail="restart" \
op monitor interval="7s" timeout="30s" on-fail="restart" \
op monitor interval="2s" role="Master" timeout="30s" on-
fail="restart" \
op promote interval="0s" timeout="120s" on-fail="restart" \
op demote interval="0s" timeout="30s" on-fail="stop" \
op stop interval="0s" timeout="30s" on-fail="block" \
op notify interval="0s" timeout="30s"
primitive SERVICE-pingCheck ocf:pacemaker:ping \
params host_list="10.40.0.99" name="default_ping_set" multiplier="100" \
op start interval="0s" timeout="60s" on-fail="restart" \
op monitor interval="2s" timeout="60s" on-fail="restart" \
op stop interval="0s" timeout="60s" on-fail="ignore"
primitive SERVICE-service ocf:heartbeat:service_service_ocf \
op monitor interval="7s" timeout="30s" on-fail="restart"
primitive SERVICE-tomcat ocf:heartbeat:tomcat \
params java_home="/usr/java/default"
catalina_home="/usr/share/tomcat6" statusurl="http://127.0.0.1:9081/admin"
catalina_pid="/var/run/tomcat6.pid" tomcat_user="tomcat"
script_log="/var/log/service_ra_tomcat.log" \
op monitor interval="10s" timeout="40s" on-fail="restart" depth="0"
\
op start interval="0" timeout="40s"
group master-group SERVICE-MIP1 SERVICE-MIP2 SERVICE-service SERVICE-tomcat
ms msPostgresql SERVICE-pgsql \
meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
clone clnPingd SERVICE-pingCheck
location rsc_location-1 msPostgresql \
rule $id="rsc_location-1-rule" -inf: not_defined default_ping_set or
default_ping_set lt 100
colocation rsc_colocation-1 inf: msPostgresql clnPingd
colocation rsc_colocation-2 inf: master-group msPostgresql:Master
colocation rsc_colocation-3 -inf: SERVICE-VIP msPostgresql:Master
colocation rsc_colocation-4 -inf: SERVICE-res-mon-s1 msPostgresql:Master
order rsc_order-1 0: clnPingd msPostgresql symmetrical=false
order rsc_order-2 inf: msPostgresql:promote master-group:start symmetrical=false
order rsc_order-3 0: msPostgresql:demote master-group:stop symmetrical=false
property $id="cib-bootstrap-options" \
dc-version="1.1.10-368c726" \
cluster-infrastructure="corosync" \
no-quorum-policy="ignore" \
stonith-enabled="false" \
crmd-transition-delay="0s" \
last-lrm-refresh="1389346364"
rsc_defaults $id="rsc-options" \
resource-stickiness="INFINITY" \
migration-threshold="1"
#vim:set syntax=pcmk