[Pacemaker] postgresql failover
Gregg Jaskiewicz
gryzman at gmail.com
Thu Aug 15 12:00:24 EDT 2013
Hey folks,
Following a few battles with the thing, I managed to get the pgsql RA to run
on 4 nodes. It's all great, however...
When testing the failover, I unplugged the 'master' machine. The slaves get
sorted out and a new master is elected, but the slaves then don't
reconnect to the new master.
They all complain about missing stuff in pg_archive, which I was told to
ignore.
But they still don't reconnect to the new master to keep the replication
going.
cp: cannot stat
`/var/lib/pgsql/9.2/data/pg_archive/00000007000000010000003F': No such file
or directory
cp: cannot stat
`/var/lib/pgsql/9.2/data/pg_archive/00000007000000010000003F': No such file
or directory
cp: cannot stat `/var/lib/pgsql/9.2/data/pg_archive/00000008.history': No
such file or directory
FATAL: timeline 8 of the primary does not match recovery target timeline 7
It's the last line that worries me.
Until I run rsync manually to sync up pg_archive with master, it doesn't
work anymore.
Not sure where I went wrong.
Here's my crm config:
node hanode01 \
attributes pgsql-data-status="DISCONNECT"
kernel="2.6.32-279.el6.x86_64" foobar="barfoo"
node hanode02 \
attributes pgsql-data-status="DISCONNECT"
node hanode03 \
attributes pgsql-data-status="LATEST"
node hanode04 \
attributes pgsql-data-status="DISCONNECT"
primitive pgsql ocf:heartbeat:pgsql \
params pgctl="/usr/pgsql-9.2/bin/pg_ctl"
psql="/usr/pgsql-9.2/bin/psql" pgdata="/var/lib/pgsql/9.2/data/"
restore_command="cp /var/lib/pgsql/9.2/data/pg_archive/\%f \%p"
start_opt="-p 5432" rep_mode="async" node_list="hanode01 hanode02 hanode03
hanode04" master_ip="10.0.1.100" stop_escalate="0" repuser="replicator"
monitor_password="lemon31ee7" monitor_user="monitor" \
op start interval="0s" role="Master" timeout="260s"
on-fail="restart" \
op monitor interval="2s" role="Master" timeout="260s"
on-fail="restart" \
op monitor interval="7s" timeout="260s" on-fail="restart" \
op promote interval="0s" timeout="260s" on-fail="restart" \
op demote interval="0s" timeout="260s" on-fail="stop" \
op stop interval="0s" timeout="260s" on-fail="block" \
op notify interval="0s" timeout="260s"
primitive vip-master ocf:heartbeat:IPaddr2 \
params ip="10.0.0.100" nic="eth1" cidr_netmask="24" \
op start interval="0s" timeout="260s" on-fail="restart" \
op monitor interval="10s" timeout="260s" on-fail="restart" \
op stop interval="0s" timeout="260s" on-fail="block"
primitive vip-rep ocf:heartbeat:IPaddr2 \
params ip="10.0.1.100" nic="eth2" cidr_netmask="24" \
op start interval="0s" timeout="260s" on-fail="restart" \
op monitor interval="10s" timeout="260s" on-fail="restart" \
op stop interval="0s" timeout="260s" on-fail="block"
group master-group vip-master vip-rep
ms msPostgresql pgsql \
meta master-max="1" master-node-max="1" clone-max="10"
clone-node-max="1" notify="true" target-role="Master"
colocation rsc_colocation-2 inf: master-group msPostgresql:Master
order rsc_order-2 0: msPostgresql:promote master-group:start
symmetrical=false
order rsc_order-3 0: msPostgresql:demote master-group:stop symmetrical=false
property $id="cib-bootstrap-options" \
dc-version="1.1.9-1512.el6-2a917dd" \
cluster-infrastructure="classic openais (with plugin)" \
expected-quorum-votes="4" \
stonith-enabled="false" \
no-quorum-policy="ignore" \
last-lrm-refresh="1376582085"
rsc_defaults $id="rsc_defaults-options" \
resource-stickiness="INFINITY" \
migration-threshold="5"
and postgresql configuration:
listen_addresses = '*'
wal_level = hot_standby
synchronous_commit = on
archive_mode = on
archive_command = 'cp %p /var/lib/pgsql/9.2/data/pg_archive/%f'
max_wal_senders=5
wal_keep_segments = 32
hot_standby = on
restart_after_crash = off
replication_timeout = 5000 # mseconds
wal_receiver_status_interval = 2 # seconds
max_standby_streaming_delay = -1
max_standby_archive_delay = -1
synchronous_commit = on
restart_after_crash = off
hot_standby_feedback = on
and pg_hba.conf:
# "local" is for Unix domain socket connections only
local all all trust
# IPv4 local connections:
host all all 127.0.0.1/32 trust
# Allow replication connections from localhost, by a user with the
# replication privilege.
#local replication postgres peer
host replication postgres 127.0.0.1/32 trust
host replication replicator 10.0.0.0/8 trust
host all all 10.0.0.0/8 md5
--
GJ
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.clusterlabs.org/pipermail/pacemaker/attachments/20130815/aa1e4860/attachment-0002.html>
More information about the Pacemaker
mailing list