[Pacemaker] postgresql failover

Thu Aug 15 16:00:24 UTC 2013

Hey folks,

After a few battles with the thing, I managed to get the pgsql RA to run on
4 nodes. It's all great, however...
When testing failover, I unplugged the 'master' machine. The slaves get
sorted out and a new master is elected, but the slaves then don't
reconnect to the new master.
They all complain about missing files in pg_archive, which I was told to
ignore.
But they still don't reconnect to the new master to keep the replication
going.

cp: cannot stat
`/var/lib/pgsql/9.2/data/pg_archive/00000007000000010000003F': No such file
or directory
cp: cannot stat
`/var/lib/pgsql/9.2/data/pg_archive/00000007000000010000003F': No such file
or directory
cp: cannot stat `/var/lib/pgsql/9.2/data/pg_archive/00000008.history': No
such file or directory
FATAL:  timeline 8 of the primary does not match recovery target timeline 7

It's the last line that worries me.
Replication doesn't work again until I manually run rsync to sync up
pg_archive with the master.

I'm not sure where I went wrong.

Here's my crm config:

node hanode01 \
        attributes pgsql-data-status="DISCONNECT"
kernel="2.6.32-279.el6.x86_64" foobar="barfoo"
node hanode02 \
        attributes pgsql-data-status="DISCONNECT"
node hanode03 \
        attributes pgsql-data-status="LATEST"
node hanode04 \
        attributes pgsql-data-status="DISCONNECT"
primitive pgsql ocf:heartbeat:pgsql \
        params pgctl="/usr/pgsql-9.2/bin/pg_ctl"
psql="/usr/pgsql-9.2/bin/psql" pgdata="/var/lib/pgsql/9.2/data/"
restore_command="cp /var/lib/pgsql/9.2/data/pg_archive/\%f \%p"
start_opt="-p 5432" rep_mode="async" node_list="hanode01 hanode02 hanode03
hanode04" master_ip="10.0.1.100" stop_escalate="0" repuser="replicator"
monitor_password="lemon31ee7" monitor_user="monitor" \
        op start interval="0s" role="Master" timeout="260s"
on-fail="restart" \
        op monitor interval="2s" role="Master" timeout="260s"
on-fail="restart" \
        op monitor interval="7s" timeout="260s" on-fail="restart" \
        op promote interval="0s" timeout="260s" on-fail="restart" \
        op demote interval="0s" timeout="260s" on-fail="stop" \
        op stop interval="0s" timeout="260s" on-fail="block" \
        op notify interval="0s" timeout="260s"
primitive vip-master ocf:heartbeat:IPaddr2 \
        params ip="10.0.0.100" nic="eth1" cidr_netmask="24" \
        op start interval="0s" timeout="260s" on-fail="restart" \
        op monitor interval="10s" timeout="260s" on-fail="restart" \
        op stop interval="0s" timeout="260s" on-fail="block"
primitive vip-rep ocf:heartbeat:IPaddr2 \
        params ip="10.0.1.100" nic="eth2" cidr_netmask="24" \
        op start interval="0s" timeout="260s" on-fail="restart" \
        op monitor interval="10s" timeout="260s" on-fail="restart" \
        op stop interval="0s" timeout="260s" on-fail="block"
group master-group vip-master vip-rep
ms msPostgresql pgsql \
        meta master-max="1" master-node-max="1" clone-max="10"
clone-node-max="1" notify="true" target-role="Master"
colocation rsc_colocation-2 inf: master-group msPostgresql:Master
order rsc_order-2 0: msPostgresql:promote master-group:start
symmetrical=false
order rsc_order-3 0: msPostgresql:demote master-group:stop symmetrical=false
property $id="cib-bootstrap-options" \
        dc-version="1.1.9-1512.el6-2a917dd" \
        cluster-infrastructure="classic openais (with plugin)" \
        expected-quorum-votes="4" \
        stonith-enabled="false" \
        no-quorum-policy="ignore" \
        last-lrm-refresh="1376582085"
rsc_defaults $id="rsc_defaults-options" \
        resource-stickiness="INFINITY" \
        migration-threshold="5"

And here's the PostgreSQL configuration:
listen_addresses = '*'
wal_level = hot_standby
synchronous_commit = on
archive_mode = on
archive_command = 'cp %p /var/lib/pgsql/9.2/data/pg_archive/%f'
max_wal_senders=5
wal_keep_segments = 32
hot_standby = on
restart_after_crash = off
replication_timeout = 5000         # mseconds
wal_receiver_status_interval = 2   # seconds
max_standby_streaming_delay = -1
max_standby_archive_delay = -1
synchronous_commit = on
restart_after_crash = off
hot_standby_feedback = on

and pg_hba.conf:

# "local" is for Unix domain socket connections only
local   all             all                                     trust
# IPv4 local connections:
host    all             all             127.0.0.1/32            trust
# Allow replication connections from localhost, by a user with the
# replication privilege.
#local   replication     postgres                                peer
host    replication     postgres        127.0.0.1/32            trust
host    replication     replicator        10.0.0.0/8            trust
host    all             all             10.0.0.0/8               md5

-- 
GJ
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.clusterlabs.org/pipermail/pacemaker/attachments/20130815/aa1e4860/attachment-0003.html>