[Pacemaker] Corosync 2.3 dies randomly
Robert Parsons
rparsons at tappublishing.com
Tue May 7 21:01:31 UTC 2013
> Corosync has a blackbox - did you interrogate that too?
I grabbed the latest source from github and the problem remains. Here's
some diagnostic output:
strace on the corosync process:
recvmsg(12, {msg_name(16)={sa_family=AF_INET, sin_port=htons(8999),
sin_addr=inet_addr("10.1.4.133")},
msg_iov(1)=[{"\376\376\0\0\0\0\"\377\205\4\1\no\0\0\0\303\5\0\0o\0\0\0\0\0\0\0w\4\1\n"...,
10000}], msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 74
poll([{fd=8, events=POLLIN}], 1, 0) = 0 (Timeout)
poll([{fd=9, events=POLLIN}], 1, 0) = 0 (Timeout)
clock_gettime(CLOCK_MONOTONIC, {2443, 951478063}) = 0
sendmsg(11, {msg_name(16)={sa_family=AF_INET, sin_port=htons(9000),
sin_addr=inet_addr("10.1.4.122")},
msg_iov(1)=[{"\376\376\0\0\0\0\"\377w\4\1\no\0\0\0\304\5\0\0o\0\0\0\0\0\0\0w\4\1\n"...,
74}], msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 74
clock_gettime(CLOCK_MONOTONIC, {2443, 952476414}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 955080092}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 955358106}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 955640730}) = 0
epoll_wait(4, {{EPOLLIN, {u32=16, u64=1723612260405870608}}}, 12, 0) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 956315715}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 956515750}) = 0
epoll_wait(4, {{EPOLLIN, {u32=16, u64=1723612260405870608}}}, 12, 57) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 957009047}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 957280723}) = 0
recvmsg(12, {msg_name(16)={sa_family=AF_INET, sin_port=htons(8999),
sin_addr=inet_addr("10.1.4.133")},
msg_iov(1)=[{"\376\376\0\0\0\0\"\377\205\4\1\no\0\0\0\311\5\0\0o\0\0\0\0\0\0\0w\4\1\n"...,
10000}], msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 74
poll([{fd=8, events=POLLIN}], 1, 0) = 0 (Timeout)
poll([{fd=9, events=POLLIN}], 1, 0) = 0 (Timeout)
clock_gettime(CLOCK_MONOTONIC, {2443, 958503649}) = 0
sendmsg(11, {msg_name(16)={sa_family=AF_INET, sin_port=htons(9000),
sin_addr=inet_addr("10.1.4.122")},
msg_iov(1)=[{"\376\376\0\0\0\0\"\377w\4\1\no\0\0\0\312\5\0\0o\0\0\0\0\0\0\0w\4\1\n"...,
74}], msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 74
clock_gettime(CLOCK_MONOTONIC, {2443, 959145106}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 959414728}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 959635399}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 959929783}) = 0
epoll_wait(4, {{EPOLLIN, {u32=16, u64=1723612260405870608}}}, 12, 0) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 960568631}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 961997811}) = 0
recvmsg(12, {msg_name(16)={sa_family=AF_INET, sin_port=htons(8999),
sin_addr=inet_addr("10.1.4.133")},
msg_iov(1)=[{"\376\376\0\0\0\0\"\377\205\4\1\no\0\0\0\317\5\0\0o\0\0\0\0\0\0\0w\4\1\n"...,
10000}], msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 74
poll([{fd=8, events=POLLIN}], 1, 0) = 0 (Timeout)
poll([{fd=9, events=POLLIN}], 1, 0) = 0 (Timeout)
clock_gettime(CLOCK_MONOTONIC, {2443, 963223314}) = 0
sendmsg(11, {msg_name(16)={sa_family=AF_INET, sin_port=htons(9000),
sin_addr=inet_addr("10.1.4.122")},
msg_iov(1)=[{"\376\376\0\0\0\0\"\377w\4\1\no\0\0\0\320\5\0\0o\0\0\0\0\0\0\0w\4\1\n"...,
74}], msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 74
clock_gettime(CLOCK_MONOTONIC, {2443, 963959031}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 964185154}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 964537071}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 965621813}) = 0
epoll_wait(4, {{EPOLLIN, {u32=10, u64=4183101728859619338}}}, 12, 0) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 966212452}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 966413788}) = 0
epoll_wait(4, {{EPOLLIN, {u32=10, u64=4183101728859619338}}}, 12, 47) = 1
clock_gettime(CLOCK_MONOTONIC, {2443, 966908130}) = 0
clock_gettime(CLOCK_MONOTONIC, {2443, 967178141}) = 0
recvmsg(8, {msg_name(16)={sa_family=AF_INET, sin_port=htons(8999),
sin_addr=inet_addr("10.1.4.122")},
msg_iov(1)=[{"\376\376\0\0\1\2\"\377z\4\1\n\2z\4\1\n\2\0\n\1\4z\0\0\0\0\0\0\0\0\0"...,
10000}], msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 1472
--- SIGBUS (Bus error) @ 0 (0) ---
corosync-blackbox:
Failed to initialize the cmap API. Error CS_ERR_LIBRARY
Failed to initialize the cmap API. Error CS_ERR_LIBRARY
Dumping the contents of /var/lib/corosync/fdata
[debug] shm size:8388608; real_size:8388608; rb->word_size:2097152
[debug] read total of: 8388620
Ringbuffer:
->NORMAL
->write_pt [673]
->read_pt [0]
->size [2097152 words]
=>free [8385912 bytes]
=>used [2692 bytes]
debug May 07 15:26:52 handle_new_connection(476):2147483648: IPC
credentials authenticated (13438-14116-18)
debug May 07 15:26:52 qb_ipcs_shm_connect(294):9: connecting to client
[14116]
debug May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd;
rb->word_size:1048576
debug May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd;
rb->word_size:1048576
debug May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd;
rb->word_size:1048576
debug May 07 15:26:52 cs_ipcs_connection_created(269):8: connection
created
debug May 07 15:26:52 cmap_lib_init_fn(306):9: lib_init_fn:
conn=0x7feb08b0ea00
debug May 07 15:26:52 qb_ipcs_dispatch_connection_request(723):9: HUP
conn (13438-14116-18)
debug May 07 15:26:52 qb_ipcs_disconnect(565):9:
qb_ipcs_disconnect(13438-14116-18) state:2
debug May 07 15:26:52 _del(117):9: epoll_ctl(del): Bad file descriptor (9)
debug May 07 15:26:52 cs_ipcs_connection_closed(414):8:
cs_ipcs_connection_closed()
debug May 07 15:26:52 cmap_lib_exit_fn(325):9: exit_fn for
conn=0x7feb08b0ea00
debug May 07 15:26:52 cs_ipcs_connection_destroyed(387):8:
cs_ipcs_connection_destroyed()
trace May 07 15:26:52 qb_rb_close(279):9: ENTERING qb_rb_close()
trace May 07 15:26:52 my_posix_sem_destroy(91):9: ENTERING
my_posix_sem_destroy()
debug May 07 15:26:52 qb_rb_close(290):9: Free'ing ringbuffer:
/dev/shm/qb-cmap-response-13438-14116-18-header
trace May 07 15:26:52 qb_rb_close(279):9: ENTERING qb_rb_close()
trace May 07 15:26:52 my_posix_sem_destroy(91):9: ENTERING
my_posix_sem_destroy()
debug May 07 15:26:52 qb_rb_close(290):9: Free'ing ringbuffer:
/dev/shm/qb-cmap-event-13438-14116-18-header
trace May 07 15:26:52 qb_rb_close(279):9: ENTERING qb_rb_close()
trace May 07 15:26:52 my_posix_sem_destroy(91):9: ENTERING
my_posix_sem_destroy()
debug May 07 15:26:52 qb_rb_close(290):9: Free'ing ringbuffer:
/dev/shm/qb-cmap-request-13438-14116-18-header
debug May 07 15:26:52 handle_new_connection(476):2147483648: IPC
credentials authenticated (13438-14118-18)
debug May 07 15:26:52 qb_ipcs_shm_connect(294):9: connecting to client
[14118]
debug May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd;
rb->word_size:1048576
debug May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd;
rb->word_size:1048576
debug May 07 15:26:52 qb_rb_open_2(226):9: shm size:zd; real_size:zd;
rb->word_size:1048576
debug May 07 15:26:52 cs_ipcs_connection_created(269):8: connection
created
debug May 07 15:26:52 cmap_lib_init_fn(306):9: lib_init_fn:
conn=0x7feb08b0ea00
ERROR: qb_rb_chunk_read failed: Connection timed out
[trace] ENTERING qb_rb_close()
[debug] Free'ing ringbuffer: /dev/shm/qb-create_from_file-header
syslog at moment of death
May 7 15:47:34 krusty corosync[4660]: [QUORUM] total_votes=6,
expected_votes=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838839
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838841
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838842
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838844
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838845
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838846
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838849
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838850
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838852
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838853
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838854
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] got nodeinfo message
from cluster node 167838844
May 7 15:47:34 krusty corosync[4660]: [QUORUM] nodeinfo message[0]:
votes: 0, expected: 0 flags: 0
May 7 15:47:34 krusty corosync[4660]: [QUORUM] got nodeinfo message
from cluster node 167838846
May 7 15:47:34 krusty corosync[4660]: [QUORUM] nodeinfo
message[167838846]: votes: 1, expected: 17 flags: 0
May 7 15:47:34 krusty corosync[4660]: [QUORUM] flags: quorate: No
Leaving: No WFA Status: No First: No Qdevice: No QdeviceAlive: No
QdeviceCastVote: No QdeviceMasterWins: No
May 7 15:47:34 krusty corosync[4660]: [QUORUM] total_votes=6,
expected_votes=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838839
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838841
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838842
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838844
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838845
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838846
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838849
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838850
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838852
state=2, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838853
state=1, votes=1, expected=17
May 7 15:47:34 krusty corosync[4660]: [QUORUM] node 167838854
state=2, votes=1, expected=17
May 7 15:47:34 krusty rsyslogd-2177: imuxsock begins to drop messages
from pid 4660 due to rate-limiting
May 7 15:47:34 krusty crmd[5631]: notice:
corosync_mark_unseen_peer_dead: Node 167838845/maude was not seen in the
previous transition
May 7 15:47:34 krusty crmd[5631]: notice: crm_update_peer_state:
corosync_mark_unseen_peer_dead: Node maude[167838845] - state is now
lost (was member)
May 7 15:47:36 krusty crmd[5631]: error: crmd_quorum_destroy:
connection terminated
May 7 15:47:36 krusty crmd[5631]: error:
crmd_cib_connection_destroy: Connection to the CIB terminated...
May 7 15:47:36 krusty stonith-ng[5629]: error: pcmk_cpg_dispatch:
Connection to the CPG API failed: 2
May 7 15:47:36 krusty stonith-ng[5629]: error:
stonith_peer_ais_destroy: AIS connection terminated
May 7 15:47:36 krusty attrd[5630]: error: pcmk_cpg_dispatch:
Connection to the CPG API failed: 2
May 7 15:47:36 krusty attrd[5630]: crit: attrd_ais_destroy: Lost
connection to Corosync service!
May 7 15:47:36 krusty attrd[5630]: notice: main: Exiting...
May 7 15:47:36 krusty attrd[5630]: error:
attrd_cib_connection_destroy: Connection to the CIB terminated...
May 7 15:47:36 krusty cib[5628]: error: pcmk_cpg_dispatch:
Connection to the CPG API failed: 2
May 7 15:47:36 krusty cib[5628]: error: cib_ais_destroy: Corosync
connection lost! Exiting.
The process does not produce an error message and does not dump core.
Thanks for any help.
- Rob P.
More information about the Pacemaker
mailing list