Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=e1…
Commit: e114e9ad6579674555bfa019a29189b0a771bfe0
Parent: 66c89c1bbedd2217b199e76814960aae97d6b0fe
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
AuthorDate: Tue Apr 24 12:07:25 2012 +0200
Committer: Ryan McCabe <rmccabe@redhat.com>
CommitterDate: Tue May 1 10:57:48 2012 -0400
cpglockd: fix a startup race condition
[root@clusternet-node1 shm]# cpglockd -f
nodeid 1 already in group with PID 8126 8126
Unable to join CPG group
In some cases the CPG membership is updated between
cpglockd joining and cpglockd requesting the current membership,
triggering the above error incorrectly.
Check whether the PIDs are the same; if they match, we can be
reasonably sure the entry is our own process (given the previous fix
for the startup PID/lock check).
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
Reviewed-by: Lon Hohberger <lhh@redhat.com>
Reviewed-by: Ryan McCabe <rmccabe@redhat.com>
---
rgmanager/src/daemons/cpglockd.c | 10 ++++++----
1 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/rgmanager/src/daemons/cpglockd.c b/rgmanager/src/daemons/cpglockd.c
index 5b6ea79..1bd53d3 100644
--- a/rgmanager/src/daemons/cpglockd.c
+++ b/rgmanager/src/daemons/cpglockd.c
@@ -1309,10 +1309,12 @@ cpg_init(void)
for (i = 0 ; i < cpg_member_list_len ; i++) {
if (member_list[i].nodeid == my_node_id) {
- fprintf(stderr, "nodeid %d already in group with PID %u\n",
- member_list[i].nodeid, member_list[i].pid);
- cpg_fin();
- return -1;
+ if (member_list[i].pid != getpid()) {
+ fprintf(stderr, "nodeid %d already in group with PID %u %u\n",
+ member_list[i].nodeid, member_list[i].pid, getpid());
+ cpg_fin();
+ return -1;
+ }
}
}
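For reference, the membership check this patch converges on can be sketched in isolation against the libcpg API. This is a minimal, hedged sketch rather than the cpglockd code itself; the group name "cpglockd", the helper name, and the surrounding error handling are assumptions.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <corosync/cpg.h>

/*
 * Sketch only: after cpg_join(), an entry for our own node ID in the
 * group membership is harmless if it belongs to this very process
 * (same PID); it is only an "already joined" error when the PID differs.
 */
static int already_joined_by_other(cpg_handle_t handle, uint32_t my_node_id)
{
	struct cpg_name group;
	struct cpg_address members[CPG_MEMBERS_MAX];
	int n_members = CPG_MEMBERS_MAX;
	int i;

	strcpy(group.value, "cpglockd");	/* group name is an assumption */
	group.length = strlen(group.value);

	if (cpg_membership_get(handle, &group, members, &n_members) != CS_OK)
		return -1;	/* could not read the current membership */

	for (i = 0; i < n_members; i++) {
		if (members[i].nodeid == my_node_id &&
		    members[i].pid != (uint32_t) getpid()) {
			fprintf(stderr, "nodeid %u already in group with PID %u\n",
			        members[i].nodeid, members[i].pid);
			return 1;	/* a different process holds our slot */
		}
	}
	return 0;	/* either not listed yet, or the entry is ours */
}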
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=7e…
Commit: 7e44784a83001516ab02ec66495fc1b83255ea3a
Parent: 26c3db58dac85cabef6001b9b9abc3a089b77c26
Author: Lon Hohberger <lhh@redhat.com>
AuthorDate: Tue May 1 10:18:58 2012 -0400
Committer: Lon Hohberger <lhh@redhat.com>
CommitterDate: Tue May 1 10:28:00 2012 -0400
rgmanager: Add man page for cpglockd; update rgmanager.8
Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
rgmanager/man/cpglockd.8 | 21 +++++++++++++++++++++
rgmanager/man/rgmanager.8 | 7 ++++++-
2 files changed, 27 insertions(+), 1 deletions(-)
diff --git a/rgmanager/man/cpglockd.8 b/rgmanager/man/cpglockd.8
new file mode 100644
index 0000000..8084d96
--- /dev/null
+++ b/rgmanager/man/cpglockd.8
@@ -0,0 +1,21 @@
+.TH "cpglockd" "8" "May 2012" "" "Red Hat High Availability"
+.SH "NAME"
+cpglockd \- CPG lock server for rgmanager
+.SH "DESCRIPTION"
+.PP
+.B cpglockd
+utilizes the extended virtual synchrony features of the Corosync
+Cluster Engine to implement a simplistic, distributed lock server
+for rgmanager.
+.SH "COMMAND LINE OPTIONS"
+.IP \-F
+Don't wait for the current node to join the fencing domain at startup
+.IP \-Q
+Don't wait for quorum formation at startup
+.IP \-f
+Operate in the foreground mode; do not daemonize
+.IP \-h
+Print command line usage.
+
+.SH "SEE ALSO"
+rgmanager(8), corosync(8)
diff --git a/rgmanager/man/rgmanager.8 b/rgmanager/man/rgmanager.8
index 2b018d2..bab52ef 100644
--- a/rgmanager/man/rgmanager.8
+++ b/rgmanager/man/rgmanager.8
@@ -380,8 +380,13 @@ Do not perform stop-before-start. Combined with the
.I -Z
flag to clusvcadm, this can be used to allow rgmanager to be upgraded
without stopping a given user service or set of services.
+.IP \-C [0|1]
+Explicitly disable or enable CPG-based locking. The default is to
+enable this when RRP is turned on (which requires a cluster outage).
+This option MUST be the same on all hosts in the cluster and must
+only be enabled or disabled with all instances of rgmanager turned off.
.SH "SEE ALSO"
http://sources.redhat.com/cluster/wiki/RGManager
-clusvcadm(8), cluster.conf(5)
+clusvcadm(8), cluster.conf(5), cpglockd(8)
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=af…
Commit: af8536f05c6d908b1ea8af115a53f5e4f3ae8e79
Parent: 32c6ea0595c7091f13bcfbae8879c7e9ef035d98
Author: Ryan McCabe <rmccabe@redhat.com>
AuthorDate: Thu Apr 26 14:12:37 2012 -0400
Committer: Ryan McCabe <rmccabe@redhat.com>
CommitterDate: Thu Apr 26 14:12:37 2012 -0400
Work around a fenced hang/delay that can result in us unpausing too soon while waiting for fencing to complete.
Signed-off-by: Ryan McCabe <rmccabe@redhat.com>
---
rgmanager/src/daemons/cpglockd.c | 53 +++++++++++++++++++++++++-------------
1 files changed, 35 insertions(+), 18 deletions(-)
diff --git a/rgmanager/src/daemons/cpglockd.c b/rgmanager/src/daemons/cpglockd.c
index fa40e98..f101084 100644
--- a/rgmanager/src/daemons/cpglockd.c
+++ b/rgmanager/src/daemons/cpglockd.c
@@ -40,6 +40,7 @@ struct lock_node {
struct pending_fence_node {
list_head();
int nodeid;
+ int force_wait;
uint64_t fail_time;
};
@@ -96,6 +97,22 @@ flag_shutdown(int __attribute__ ((unused)) sig)
shutdown_pending = 1;
}
+
+static int
+is_member(uint32_t nodeid)
+{
+ struct member_node *n;
+ int x;
+
+ list_for(&group_members, n, x) {
+ if (n->nodeid == nodeid)
+ return 1;
+ }
+
+ return 0;
+}
+
+
static int
cman_nodes_lost(cman_node_t *old_nodes,
size_t old_node_len,
@@ -221,6 +238,14 @@ cman_callback(cman_handle_t ch, void *privdata, int reason, int arg)
pf = do_alloc(sizeof(*pf));
pf->nodeid = cur_nodeid;
pf->fail_time = cur_time;
+ /*
+ ** If the node is also a member of the cpglock group, wait
+ ** for positive confirmation from fenced that it was fenced.
+ ** It cannot have shut down cleanly if we did not process a
+ ** DELETE for it yet.
+ */
+ if (is_member(cur_nodeid))
+ pf->force_wait = 1;
list_append(&pending_fencing, pf);
} else {
logt_print(LOG_DEBUG, "Lost node %d but fencing not configured\n",
@@ -904,21 +929,6 @@ process_lock(struct cpg_lock_msg *m)
static int
-is_member(uint32_t nodeid)
-{
- struct member_node *n;
- int x;
-
- list_for(&group_members, n, x) {
- if (n->nodeid == nodeid)
- return 1;
- }
-
- return 0;
-}
-
-
-static int
process_grant(struct cpg_lock_msg *m, uint32_t nodeid)
{
struct lock_node *l;
@@ -1645,7 +1655,8 @@ main(int argc, char **argv)
}
if (lft > pf_node->fail_time) {
- logt_print(LOG_DEBUG, "Fencing for node %d finished at %ld (>%ld)\n",
+ logt_print(LOG_DEBUG,
+ "Fencing for node %d finished at %ld (>%ld)\n",
pf_node->nodeid, lft, pf_node->fail_time);
list_remove(&pending_fencing, pf_node);
free(pf_node);
@@ -1663,13 +1674,14 @@ main(int argc, char **argv)
** victim to 1 by now, we can deduce it has left cleanly, and we
** don't need to wait for it.
*/
- if (!victim && !x) {
+ if (!victim && !x && !pf_node->force_wait) {
int retries = 0;
/* Wait up to 1s for fenced to set victim */
do {
usleep(250000);
if (fenced_node_info(pf_node->nodeid, &fn) < 0) {
- logt_print(LOG_DEBUG, "Unable to get fenced data for node %d\n",
+ logt_print(LOG_DEBUG,
+ "Unable to get fenced data for node %d\n",
pf_node->nodeid);
} else
victim = fn.victim;
@@ -1685,6 +1697,11 @@ main(int argc, char **argv)
}
goto fence_check;
}
+ if (!victim && !x && pf_node->force_wait) {
+ logt_print(LOG_DEBUG, "Would have removed %d but now waiting\n",
+ pf_node->nodeid);
+ }
+
}
if (shutdown_pending)
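The effect of the new force_wait flag can be summarised with a standalone sketch. This is an illustration of the decision the patch adds, not the daemon's own code; the struct and parameter names are assumptions.

/*
 * Sketch: a lost node that was still a member of the cpglock CPG group
 * cannot have left cleanly (no DELETE was processed for it), so fencing
 * confirmation from fenced must be awaited even if fenced has not yet
 * marked the node as a victim.
 */
struct pending_fence_sketch {
	int nodeid;
	int force_wait;			/* node was a cpglock group member when lost */
	unsigned long long fail_time;
};

static int must_wait_for_fenced(const struct pending_fence_sketch *pf,
                                int fenced_victim)
{
	/* Wait when fenced already lists the node as a victim, or when the
	 * node was still in the cpglock group (the new force_wait case) and
	 * so cannot have shut down cleanly. */
	return fenced_victim || pf->force_wait;
}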
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=16…
Commit: 16ae7b906511976910aa17f879c9e2a6e14b0cfa
Parent: 44a2a2f0b39b90154d36eeaeac8debeb291c9097
Author: Fabio M. Di Nitto <fdinitto@redhat.com>
AuthorDate: Wed Apr 25 09:25:07 2012 +0200
Committer: Fabio M. Di Nitto <fdinitto@redhat.com>
CommitterDate: Wed Apr 25 09:25:07 2012 +0200
rgmanager: fix cpglockd vs rgmanager startup race condition with systemd
systemd considers a service fully operational as soon as /var/run/foo.pid
is written.
This creates a small window in which rgmanager can start before cpglockd
is operational, and rgmanager would then fail.
Loop over cpglockdump to verify that cpglockd is functional, and only
then start rgmanager.
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
---
rgmanager/init.d/rgmanager.in | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/rgmanager/init.d/rgmanager.in b/rgmanager/init.d/rgmanager.in
index e7ee916..40347ef 100644
--- a/rgmanager/init.d/rgmanager.in
+++ b/rgmanager/init.d/rgmanager.in
@@ -79,6 +79,7 @@ start_cpglockd()
{
rings="$(corosync-objctl 2>/dev/null |grep ringnumber | wc -l)"
[ "$rings" -gt "1" ] && service cpglockd start
+ while ! cpglockdump > /dev/null 2>&1; do sleep 1; done
}
rtrn=0