Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=3118b7849b85b…
Commit: 3118b7849b85b4dfc3d1b6979a64ebea893dce0e
Parent: ef45cc5ab4dc50a88506e0360cf360af29235aa4
Author: Christine Caulfield <ccaulfie(a)redhat.com>
AuthorDate: Fri Dec 20 10:35:54 2013 +0000
Committer: Christine Caulfield <ccaulfie(a)redhat.com>
CommitterDate: Fri Dec 20 10:38:18 2013 +0000
cman: fix some typos and English phrasing in cman defaults file
The cman.init.defaults file had some typos and odd non-English phrasing.
I've also fixed some inconsistent punctuation in a few places.
rhbz#1035929
Signed-off-by: Christine Caulfield <ccaulfie(a)redhat.com>
---
cman/init.d/cman.init.defaults.in | 32 ++++++++++++++++----------------
1 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/cman/init.d/cman.init.defaults.in b/cman/init.d/cman.init.defaults.in
index 835b44f..c42672c 100644
--- a/cman/init.d/cman.init.defaults.in
+++ b/cman/init.d/cman.init.defaults.in
@@ -1,23 +1,23 @@
-# CMAN_CLUSTER_TIMEOUT -- amount of time to wait for joinging a cluster
+# CMAN_CLUSTER_TIMEOUT -- amount of time to wait to join a cluster
# before giving up. If CMAN_CLUSTER_TIMEOUT is positive, then we will
-# wait CMAN_CLUSTER_TIMEOUT seconds before giving up and failing when
-# a cluster is not joined. If CMAN_CLUSTER_TIMEOUT is zero, then
-# wait indefinately for a cluster join. If CMAN_CLUSTER_TIMEOUT is
-# negative, do not check to see that the cluster has been joined
+# wait CMAN_CLUSTER_TIMEOUT seconds before giving up and failing if
+# we can't join a cluster. If CMAN_CLUSTER_TIMEOUT is zero, then we
+# will wait indefinitely for a cluster join. If CMAN_CLUSTER_TIMEOUT is
+# negative, do not check to see if we have joined a cluster.
#CMAN_CLUSTER_TIMEOUT=60
# CMAN_QUORUM_TIMEOUT -- amount of time to wait for a quorate cluster on
-# startup quorum is needed by many other applications, so we may as
+# startup. Quorum is needed by many other applications, so we may as
# well wait here. If CMAN_QUORUM_TIMEOUT is zero, quorum will
# be ignored.
#CMAN_QUORUM_TIMEOUT=45
# CMAN_SHUTDOWN_TIMEOUT -- amount of time to wait for cman to become a
-# cluster member before calling cman_tool leave during shutdown.
+# cluster member before calling 'cman_tool' leave during shutdown.
# The default is 60 seconds
#CMAN_SHUTDOWN_TIMEOUT=60
-# CMAN_NOTIFYD_START - control the startup behaviour for cmannotifyd
+# CMAN_NOTIFYD_START - control the startup behaviour for cmannotifyd,
# the variable can take 3 values:
# yes | will always start cmannotifyd
# no | will never start cmannotifyd
@@ -25,7 +25,7 @@
# are found in @NOTIFYDDIR@
#CMAN_NOTIFYD_START=conditional
-# CMAN_SSHD_START - control sshd startup behaviour
+# CMAN_SSHD_START -- control sshd startup behaviour,
# the variable can take 2 values:
# yes | cman will start sshd as early as possible
# no (default) | cman will not start sshd
@@ -74,7 +74,7 @@
# CLUSTERNAME -- override clustername as specified in cluster.conf
#CLUSTERNAME=""
-# NODENAME -- specify the nodename of this node. Default autodetected
+# NODENAME -- specify the nodename of this node. Default autodetected.
#NODENAME=""
# CONFIG_LOADER -- select default config parser.
@@ -83,21 +83,21 @@
# config propagation method. (default)
#CONFIG_LOADER=xmlconfig
-# CONFIG_VALIDATION -- select default config validation behaviour
+# CONFIG_VALIDATION -- select default config validation behaviour.
# This can be:
# FAIL - Use a very strict checking. The config will not be loaded if there
-# for any kind of warnings/errors.
-# WARN - Same as FAIL, but will allow the config to load (this is temporary
+# are any kind of warnings/errors
+# WARN - Same as FAIL, but will allow the config to load (this is temporarily
# the default behaviour)
-# NONE - Disable config validation. Highly discouraged.
+# NONE - Disable config validation. Highly discouraged
#CONFIG_VALIDATION=WARN
# CMAN_LEAVE_OPTS -- allows extra options to be passed to cman_tool when leave
# operation is performed.
#CMAN_LEAVE_OPTS=""
-# INITLOGLEVEL -- select how verbose the init script should be
-# possible values:
+# INITLOGLEVEL -- select how verbose the init script should be.
+# Possible values:
# quiet - only one line notification for start/stop operations
# terse (default) - show only required activity
# full - show everything
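For readers of the defaults file above, the CMAN_CLUSTER_TIMEOUT semantics (positive = bounded wait, zero = wait forever, negative = don't check) are implemented by the shell init script itself; the short Python sketch below only models that decision logic for illustration, and the cluster_joined callable is a hypothetical stand-in for the real join check.

import time

def wait_for_join(cluster_timeout, cluster_joined):
    # cluster_joined is a hypothetical callable returning True once the
    # node has joined the cluster.
    if cluster_timeout < 0:
        return True                              # negative: do not check at all
    deadline = None if cluster_timeout == 0 else time.time() + cluster_timeout
    while not cluster_joined():
        if deadline is not None and time.time() >= deadline:
            return False                         # positive: give up and fail after N seconds
        time.sleep(1)                            # zero: keep waiting indefinitely
    return True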
Gitweb: http://git.fedorahosted.org/git/?p=fence-agents.git;a=commitdiff;h=908fd921…
Commit: 908fd921599b4e494028f3972a71c99c0aed674f
Parent: 697f4c0e1b59cefa7768f1947ce18d0813723ec5
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Mon Dec 16 16:09:50 2013 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Mon Dec 16 16:09:50 2013 +0100
fence_virsh: Add UUID support for fence_virsh
UUID support for fence_virsh (libvirt) was added. The only function that had to be rewritten was
get_power_status because there is currently no easy way to print (name, uuid and state) via virsh.
This patch is based on one proposed by Bogdan Dobrelya for RHEL6.
---
fence/agents/virsh/fence_virsh.py | 26 +++++++++++++++-----------
1 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/fence/agents/virsh/fence_virsh.py b/fence/agents/virsh/fence_virsh.py
index ac8dc7b..1ec5310 100644
--- a/fence/agents/virsh/fence_virsh.py
+++ b/fence/agents/virsh/fence_virsh.py
@@ -15,6 +15,9 @@ REDHAT_COPYRIGHT=""
BUILD_DATE=""
#END_VERSION_GENERATION
+def get_name_or_uuid(options):
+ return options["--uuid"] if options.has_key("--uuid") else options["--plug"]
+
def get_outlets_status(conn, options):
if options.has_key("--use-sudo"):
prefix = SUDO_PATH + " "
@@ -40,20 +43,21 @@ def get_outlets_status(conn, options):
return result
def get_power_status(conn, options):
- outlets = get_outlets_status(conn, options)
+ prefix = SUDO_PATH + " " if options.has_key("--use-sudo") else ""
+ conn.sendline(prefix + "virsh domstate %s" % (get_name_or_uuid(options)))
+ conn.log_expect(options, options["--command-prompt"], int(options["--shell-timeout"]))
- if (not (options["--plug"] in outlets)):
- fail_usage("Failed: You have to enter existing name of virtual machine!")
- else:
- return outlets[options["--plug"]][1]
+ for line in conn.before.splitlines():
+ if line.strip() in ["running", "blocked", "idle", "no state", "paused"]:
+ return "on"
+ if "error:" in line.strip():
+ fail_usage("Failed: You have to enter existing name/UUID of virtual machine!")
-def set_power_status(conn, options):
- if options.has_key("--use-sudo"):
- prefix = SUDO_PATH + " "
- else:
- prefix = ""
+ return "off"
- conn.sendline(prefix + "virsh %s "%(options["--action"] == "on" and "start" or "destroy")+options["--plug"])
+def set_power_status(conn, options):
+ prefix = SUDO_PATH + " " if options.has_key("--use-sudo") else ""
+ conn.sendline(prefix + "virsh %s "%(options["--action"] == "on" and "start" or "destroy") + get_name_or_uuid(options))
conn.log_expect(options, options["--command-prompt"], int(options["--power-timeout"]))
time.sleep(int(options["--power-wait"]))
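The parsing idea in the new get_power_status can be tried against a local libvirt with plain virsh. The sketch below is not the agent's expect/ssh path; running virsh via subprocess is purely for illustration, but it maps 'virsh domstate' output to on/off the same way the patch does.

import subprocess

ON_STATES = {"running", "blocked", "idle", "no state", "paused"}

def domain_power_status(name_or_uuid):
    # Ask local libvirt for the domain state and classify it as on/off,
    # mirroring the state list and the "error:" check in the patch.
    proc = subprocess.run(["virsh", "domstate", name_or_uuid],
                          capture_output=True, text=True)
    for line in (proc.stdout + proc.stderr).splitlines():
        line = line.strip()
        if line in ON_STATES:
            return "on"
        if "error:" in line:
            raise ValueError("no such domain name/UUID: %s" % name_or_uuid)
    return "off"

For example, domain_power_status("myguest") or domain_power_status("<uuid>") returns "on" for a running or paused domain and "off" for a shut-off one.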
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=10c3dec289b63…
Commit: 10c3dec289b63a0905421ad3e827ec5d245396a7
Parent: c61e71766564cdeaea3c46cfce061000a0aa3879
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Mon Oct 14 11:42:26 2013 -0500
Committer: Christine Caulfield <ccaulfie(a)redhat.com>
CommitterDate: Tue Dec 17 10:37:56 2013 +0000
gfs_controld: fix plock transfer during first mount recovery
The plock checkpoint is not unlinked properly during certain
first mount recovery situations (lower nodeid mounts while higher
nodeid is doing first mounter recovery). This leaves a stray
checkpoint that prevents the following checkpoint from being created,
which causes plock state to not be transferred to mounting nodes,
which can lead to a plock being granted in multiple places at once.
node2: mount /gfs (it does first mount recovery)
node1: mount /gfs (while node2 is still doing first mount recovery)
node2: creates a plock checkpoint (empty) for node1, then closes
checkpoint because new low nodeid is now in charge of it
node2: sends journal info to node1
node1: gets journal info from node2
Takes special code path because node1 is still doing first
recovery. Does not call retrieve_plocks on this code path
because there are no plocks to retrieve in this case. But,
the retrieve_plocks function is also responsible for unlinking
the existing checkpoint on a new low nodeid, which this is.
So, node1 does not unlink the checkpoint as it should.
node2: finishes first mount recovery, completes mount
node1: notified that node2's first recovery is done, completes mount
node2: doplock /gfs/test (granted)
node1: killed
node1: restarts
node1: mount /gfs
node2: tries to create checkpoint to transfer the plock state to node1,
but this fails because the checkpoint exists, because node1 did
not unlink it above. So, plock state is not transferred to node1.
node1: doplock /gfs/test (granted)
The result is that both nodes have the same plock granted concurrently.
The solution is for node1 to call retrieve_plocks on the first mounter
code path, as it does on the normal code path. retrieve_plocks will
unlink the checkpoint in this case.
This patch also adds a lower level backup method to create plock
checkpoints if an unlink was missed in some cases. If
store_checkpoints finds the checkpoint exists, it will try once
to unlink it and recreate it.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/gfs_controld/plock.c | 11 +++++++----
group/gfs_controld/recover.c | 4 ++++
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index d96604f..51e0882 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -2012,6 +2012,7 @@ void store_plocks(struct mountgroup *mg, int nodeid)
struct lock_waiter *w;
int r_count, lock_count, total_size, section_size, max_section_size;
int len, owner;
+ int retry_count = 0;
if (!plocks_online)
return;
@@ -2087,13 +2088,15 @@ void store_plocks(struct mountgroup *mg, int nodeid)
if (rv == SA_AIS_ERR_EXIST) {
log_group(mg, "store_plocks: ckpt already exists");
log_error("store_plocks: ckpt already exists");
- /* TODO: best to unlink and retry? */
- /*
+ /* We should in general be unlinking the ckpt in the
+ proper places to avoid hitting this, but there are
+ probably some cases where we miss the unlink, so
+ this is a backup method. */
+ if (retry_count++)
+ return;
_unlink_checkpoint(mg, &name);
sleep(1);
goto open_retry;
- */
- return;
}
if (rv != SA_AIS_OK) {
log_error("store_plocks: ckpt open error %d %s", rv, mg->name);
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
index f70f798..87eee63 100644
--- a/group/gfs_controld/recover.c
+++ b/group/gfs_controld/recover.c
@@ -1018,6 +1018,10 @@ void received_our_jid(struct mountgroup *mg)
log_group(mg, "other node doing first mounter recovery, "
"set mount_client_delay");
mg->mount_client_delay = 1;
+ /* There should be no plocks to retrieve since the fs is being
+ mounted initially, but retrieve is needed to unlink an
+ existing checkpoint if we are the new master. */
+ retrieve_plocks(mg);
mg->save_plocks = 0;
return;
}
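The "backup method" added to store_plocks above is an unlink-and-retry-once around checkpoint creation: on SA_AIS_ERR_EXIST, unlink the stale checkpoint and retry the create exactly once. A minimal Python model of that control flow follows; ckpt_store is a hypothetical dict standing in for the AIS checkpoint service, not the real API.

def store_checkpoint(ckpt_store, name, sections):
    # Model of the retry-once backup in store_plocks(): if a stale
    # checkpoint is found, unlink it and try the create again once.
    retry_count = 0
    while True:
        if name not in ckpt_store:
            ckpt_store[name] = dict(sections)    # create succeeded
            return True
        if retry_count:                          # already retried once: give up
            return False
        retry_count += 1
        del ckpt_store[name]                     # the _unlink_checkpoint() step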
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=c61e71766564c…
Commit: c61e71766564cdeaea3c46cfce061000a0aa3879
Parent: dca24557dceec608a0e5a36efcc1005fd03f4549
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Oct 10 11:58:58 2013 -0500
Committer: Christine Caulfield <ccaulfie(a)redhat.com>
CommitterDate: Tue Dec 17 10:37:50 2013 +0000
gfs_controld: fix plock recovery
When there are two nodes in the cluster, and the
node in charge of the plock checkpoint fails,
the remaining node does not unlink the checkpoint
that had been created by the failed node. When
the failed node returns, and the new node attempts
to transfer plock state, it fails to create a new
checkpoint because it did not unlink the previous
checkpoint created by the failed node. This leads
to any existing plock state not being transferred
to the newly joined node. The newly joined node
will then mistakenly grant plocks to itself that
may conflict with plocks that the other node could
not transfer. This leads to:
1. conflicting plocks being held concurrently
2. dangling plocks that are not held but not removed
In the explanation above, the reason the remaining
node does not unlink the checkpoint that had been
created by the other node, is that it does not know
that the other node was in charge of the checkpoint.
It could only know this if it had been present before
and after the previous membership change. Because
there are only two nodes, this was not possible.
This, however, is also the point exploited to fix
the problem. When there are only two members, a new
node can assume that the other node is in charge of
the checkpoint.
The following test shows the problem/fix using
a program "doplock" that requests an exclusive,
blocking posix lock on the given file.
node1: mount /gfs
node2: mount /gfs
node1: touch /gfs/test
node1: doplock /gfs/test (granted)
node2: doplock /gfs/test (blocks)
node1: killed
node2: recovery for node1
node2: doplock above granted the lock
node1: restarts
node1: mount /gfs
node1: doplock /gfs/test
In the last step, the node1 doplock should block
because node2 holds the lock. Before the fix,
it was granted.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/gfs_controld/plock.c | 7 +++++++
group/gfs_controld/recover.c | 14 ++++++++++++--
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index 4330a2c..d96604f 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -2086,6 +2086,13 @@ void store_plocks(struct mountgroup *mg, int nodeid)
}
if (rv == SA_AIS_ERR_EXIST) {
log_group(mg, "store_plocks: ckpt already exists");
+ log_error("store_plocks: ckpt already exists");
+ /* TODO: best to unlink and retry? */
+ /*
+ _unlink_checkpoint(mg, &name);
+ sleep(1);
+ goto open_retry;
+ */
return;
}
if (rv != SA_AIS_OK) {
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
index b33b3fd..f70f798 100644
--- a/group/gfs_controld/recover.c
+++ b/group/gfs_controld/recover.c
@@ -1257,8 +1257,15 @@ void update_master_nodeid(struct mountgroup *mg)
{
struct mg_member *memb;
int new = -1, low = -1;
+ int other_nodeid = -1;
+ int total = 0;
list_for_each_entry(memb, &mg->members, list) {
+ total++;
+
+ if (memb->nodeid != our_nodeid)
+ other_nodeid = memb->nodeid;
+
if (low == -1 || memb->nodeid < low)
low = memb->nodeid;
if (!memb->finished)
@@ -1268,6 +1275,9 @@ void update_master_nodeid(struct mountgroup *mg)
}
mg->master_nodeid = new;
mg->low_nodeid = low;
+
+ if (new == -1 && total == 2)
+ mg->master_nodeid = other_nodeid;
}
/* This can happen before we receive a journals message for our mount. */
@@ -1354,8 +1364,8 @@ void recover_members(struct mountgroup *mg, int num_nodes,
*pos_out = pos;
*neg_out = neg;
- log_group(mg, "total members %d master_nodeid %d prev %d",
- mg->memb_count, mg->master_nodeid, prev_master_nodeid);
+ log_group(mg, "total members %d master_nodeid %d prev %d failed %d",
+ mg->memb_count, mg->master_nodeid, prev_master_nodeid, master_failed);
/* The master failed and we're the new master, we need to:
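The two-node fallback in update_master_nodeid can be summarized as: if no finished member yields a master (new == -1) and the group has exactly two members, assume the other node owned the checkpoint. A rough Python model is below; it assumes the master is otherwise chosen as the lowest finished nodeid (that part is not shown in the hunk), and the tuple-based member list is purely illustrative.

def update_master_nodeid(members, our_nodeid):
    # members is a list of (nodeid, finished) tuples.
    # Returns (master_nodeid, low_nodeid).
    new = low = other_nodeid = -1
    for nodeid, finished in members:
        if nodeid != our_nodeid:
            other_nodeid = nodeid
        if low == -1 or nodeid < low:
            low = nodeid
        if not finished:
            continue
        if new == -1 or nodeid < new:
            new = nodeid
    # Two-node special case from the patch: with no finished member to
    # pick, assume the other (failed) node was in charge of the checkpoint.
    if new == -1 and len(members) == 2:
        new = other_nodeid
    return new, low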
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=6bb05a0862875…
Commit: 6bb05a0862875033268faceddc764c2b2c313510
Parent: a8b3d9f0d75253ffb63858c0ac6cfd770d9c0d24
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Mon Oct 14 11:42:26 2013 -0500
Committer: Christine Caulfield <ccaulfie(a)redhat.com>
CommitterDate: Tue Dec 17 09:19:32 2013 +0000
gfs_controld: fix plock transfer during first mount recovery
The plock checkpoint is not unlinked properly during certain
first mount recovery situations (lower nodeid mounts while higher
nodeid is doing first mounter recovery). This leaves a stray
checkpoint that prevents the following checkpoint from being created,
which causes plock state to not be transferred to mounting nodes,
which can lead to a plock being granted in multiple places at once.
node2: mount /gfs (it does first mount recovery)
node1: mount /gfs (while node2 is still doing first mount recovery)
node2: creates a plock checkpoint (empty) for node1, then closes
checkpoint because new low nodeid is now in charge of it
node2: sends journal info to node1
node1: gets journal info from node2
Takes special code path because node1 is still doing first
recovery. Does not call retrieve_plocks on this code path
because there are no plocks to retrieve in this case. But,
the retrieve_plocks function is also responsible for unlinking
the existing checkpoint on a new low nodeid, which this is.
So, node1 does not unlink the checkpoint as it should.
node2: finishes first mount recovery, completes mount
node1: notified that node2's first recovery is done, completes mount
node2: doplock /gfs/test (granted)
node1: killed
node1: restarts
node1: mount /gfs
node2: tries to create checkpoint to transfer the plock state to node1,
but this fails because the checkpoint exists, because node1 did
not unlink it above. So, plock state is not transferred to node1.
node1: doplock /gfs/test (granted)
The result is that both nodes have the same plock granted concurrently.
The solution is for node1 to call retrieve_plocks on the first mounter
code path, as it does on the normal code path. retrieve_plocks will
unlink the checkpoint in this case.
This patch also adds a lower level backup method to create plock
checkpoints if an unlink was missed in some cases. If
store_checkpoints finds the checkpoint exists, it will try once
to unlink it and recreate it.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/gfs_controld/plock.c | 11 +++++++----
group/gfs_controld/recover.c | 4 ++++
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index d96604f..51e0882 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -2012,6 +2012,7 @@ void store_plocks(struct mountgroup *mg, int nodeid)
struct lock_waiter *w;
int r_count, lock_count, total_size, section_size, max_section_size;
int len, owner;
+ int retry_count = 0;
if (!plocks_online)
return;
@@ -2087,13 +2088,15 @@ void store_plocks(struct mountgroup *mg, int nodeid)
if (rv == SA_AIS_ERR_EXIST) {
log_group(mg, "store_plocks: ckpt already exists");
log_error("store_plocks: ckpt already exists");
- /* TODO: best to unlink and retry? */
- /*
+ /* We should in general be unlinking the ckpt in the
+ proper places to avoid hitting this, but there are
+ probably some cases where we miss the unlink, so
+ this is a backup method. */
+ if (retry_count++)
+ return;
_unlink_checkpoint(mg, &name);
sleep(1);
goto open_retry;
- */
- return;
}
if (rv != SA_AIS_OK) {
log_error("store_plocks: ckpt open error %d %s", rv, mg->name);
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
index f70f798..87eee63 100644
--- a/group/gfs_controld/recover.c
+++ b/group/gfs_controld/recover.c
@@ -1018,6 +1018,10 @@ void received_our_jid(struct mountgroup *mg)
log_group(mg, "other node doing first mounter recovery, "
"set mount_client_delay");
mg->mount_client_delay = 1;
+ /* There should be no plocks to retrieve since the fs is being
+ mounted initially, but retrieve is needed to unlink an
+ existing checkpoint if we are the new master. */
+ retrieve_plocks(mg);
mg->save_plocks = 0;
return;
}
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=a8b3d9f0d7525…
Commit: a8b3d9f0d75253ffb63858c0ac6cfd770d9c0d24
Parent: d2394bea973e62f171040e49dbea3c434ab24724
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Oct 10 11:58:58 2013 -0500
Committer: Christine Caulfield <ccaulfie(a)redhat.com>
CommitterDate: Tue Dec 17 09:19:10 2013 +0000
gfs_controld: fix plock recovery
When there are two nodes in the cluster, and the
node in charge of the plock checkpoint fails,
the remaining node does not unlink the checkpoint
that had been created by the failed node. When
the failed node returns, and the new node attempts
to transfer plock state, it fails to create a new
checkpoint because it did not unlink the previous
checkpoint created by the failed node. This leads
to any existing plock state not being transferred
to the newly joined node. The newly joined node
will then mistakenly grant plocks to itself that
may conflict with plocks that the other node could
not transfer. This leads to:
1. conflicting plocks being held concurrently
2. dangling plocks that are not held but not removed
In the explanation above, the reason the remaining
node does not unlink the checkpoint that had been
created by the other node, is that it does not know
that the other node was in charge of the checkpoint.
It could only know this if it had been present before
and after the previous membership change. Because
there are only two nodes, this was not possible.
This, however, is also the point exploited to fix
the problem. When there are only two members, a new
node can assume that the other node is in charge of
the checkpoint.
The following test shows the problem/fix using
a program "doplock" that requests an exclusive,
blocking posix lock on the given file.
node1: mount /gfs
node2: mount /gfs
node1: touch /gfs/test
node1: doplock /gfs/test (granted)
node2: doplock /gfs/test (blocks)
node1: killed
node2: recovery for node1
node2: doplock above granted the lock
node1: restarts
node1: mount /gfs
node1: doplock /gfs/test
In the last step, the node1 doplock should block
because node2 holds the lock. Before the fix,
it was granted.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/gfs_controld/plock.c | 7 +++++++
group/gfs_controld/recover.c | 14 ++++++++++++--
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index 4330a2c..d96604f 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -2086,6 +2086,13 @@ void store_plocks(struct mountgroup *mg, int nodeid)
}
if (rv == SA_AIS_ERR_EXIST) {
log_group(mg, "store_plocks: ckpt already exists");
+ log_error("store_plocks: ckpt already exists");
+ /* TODO: best to unlink and retry? */
+ /*
+ _unlink_checkpoint(mg, &name);
+ sleep(1);
+ goto open_retry;
+ */
return;
}
if (rv != SA_AIS_OK) {
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
index b33b3fd..f70f798 100644
--- a/group/gfs_controld/recover.c
+++ b/group/gfs_controld/recover.c
@@ -1257,8 +1257,15 @@ void update_master_nodeid(struct mountgroup *mg)
{
struct mg_member *memb;
int new = -1, low = -1;
+ int other_nodeid = -1;
+ int total = 0;
list_for_each_entry(memb, &mg->members, list) {
+ total++;
+
+ if (memb->nodeid != our_nodeid)
+ other_nodeid = memb->nodeid;
+
if (low == -1 || memb->nodeid < low)
low = memb->nodeid;
if (!memb->finished)
@@ -1268,6 +1275,9 @@ void update_master_nodeid(struct mountgroup *mg)
}
mg->master_nodeid = new;
mg->low_nodeid = low;
+
+ if (new == -1 && total == 2)
+ mg->master_nodeid = other_nodeid;
}
/* This can happen before we receive a journals message for our mount. */
@@ -1354,8 +1364,8 @@ void recover_members(struct mountgroup *mg, int num_nodes,
*pos_out = pos;
*neg_out = neg;
- log_group(mg, "total members %d master_nodeid %d prev %d",
- mg->memb_count, mg->master_nodeid, prev_master_nodeid);
+ log_group(mg, "total members %d master_nodeid %d prev %d failed %d",
+ mg->memb_count, mg->master_nodeid, prev_master_nodeid, master_failed);
/* The master failed and we're the new master, we need to:
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=6d60ac310fcb2…
Commit: 6d60ac310fcb224ca921bb78c8c43fd80ec1b03e
Parent: e675576929f09ee208f22fdfdcb750dba92fe00f
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Mon Oct 14 11:42:26 2013 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Fri Dec 13 10:17:10 2013 -0600
gfs_controld: fix plock transfer during first mount recovery
The plock checkpoint is not unlinked properly during certain
first mount recovery situations (lower nodeid mounts while higher
nodeid is doing first mounter recovery). This leaves a stray
checkpoint that prevents the following checkpoint from being created,
which causes plock state to not be transferred to mounting nodes,
which can lead to a plock being granted in multiple places at once.
node2: mount /gfs (it does first mount recovery)
node1: mount /gfs (while node2 is still doing first mount recovery)
node2: creates a plock checkpoint (empty) for node1, then closes
checkpoint because new low nodeid is now in charge of it
node2: sends journal info to node1
node1: gets journal info from node2
Takes special code path because node1 is still doing first
recovery. Does not call retrieve_plocks on this code path
because there are no plocks to retrieve in this case. But,
the retrieve_plocks function is also responsible for unlinking
the existing checkpoint on a new low nodeid, which this is.
So, node1 does not unlink the checkpoint as it should.
node2: finishes first mount recovery, completes mount
node1: notified that node2's first recovery is done, completes mount
node2: doplock /gfs/test (granted)
node1: killed
node1: restarts
node1: mount /gfs
node2: tries to create checkpoint to transfer the plock state to node1,
but this fails because the checkpoint exists, because node1 did
not unlink it above. So, plock state is not transferred to node1.
node1: doplock /gfs/test (granted)
The result is that both nodes have the same plock granted concurrently.
The solution is for node1 to call retrieve_plocks on the first mounter
code path, as it does on the normal code path. retrieve_plocks will
unlink the checkpoint in this case.
This patch also adds a lower level backup method to create plock
checkpoints if an unlink was missed in some cases. If
store_checkpoints finds the checkpoint exists, it will try once
to unlink it and recreate it.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/gfs_controld/plock.c | 11 +++++++----
group/gfs_controld/recover.c | 4 ++++
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index d96604f..51e0882 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -2012,6 +2012,7 @@ void store_plocks(struct mountgroup *mg, int nodeid)
struct lock_waiter *w;
int r_count, lock_count, total_size, section_size, max_section_size;
int len, owner;
+ int retry_count = 0;
if (!plocks_online)
return;
@@ -2087,13 +2088,15 @@ void store_plocks(struct mountgroup *mg, int nodeid)
if (rv == SA_AIS_ERR_EXIST) {
log_group(mg, "store_plocks: ckpt already exists");
log_error("store_plocks: ckpt already exists");
- /* TODO: best to unlink and retry? */
- /*
+ /* We should in general be unlinking the ckpt in the
+ proper places to avoid hitting this, but there are
+ probably some cases where we miss the unlink, so
+ this is a backup method. */
+ if (retry_count++)
+ return;
_unlink_checkpoint(mg, &name);
sleep(1);
goto open_retry;
- */
- return;
}
if (rv != SA_AIS_OK) {
log_error("store_plocks: ckpt open error %d %s", rv, mg->name);
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
index f70f798..87eee63 100644
--- a/group/gfs_controld/recover.c
+++ b/group/gfs_controld/recover.c
@@ -1018,6 +1018,10 @@ void received_our_jid(struct mountgroup *mg)
log_group(mg, "other node doing first mounter recovery, "
"set mount_client_delay");
mg->mount_client_delay = 1;
+ /* There should be no plocks to retrieve since the fs is being
+ mounted initially, but retrieve is needed to unlink an
+ existing checkpoint if we are the new master. */
+ retrieve_plocks(mg);
mg->save_plocks = 0;
return;
}
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=e675576929f09…
Commit: e675576929f09ee208f22fdfdcb750dba92fe00f
Parent: 40fb2e03b1df0bb5754c1c8e8d8c4bd6a411e88d
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Thu Oct 10 11:58:58 2013 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Fri Dec 13 10:17:10 2013 -0600
gfs_controld: fix plock recovery
When there are two nodes in the cluster, and the
node in charge of the plock checkpoint fails,
the remaining node does not unlink the checkpoint
that had been created by the failed node. When
the failed node returns, and the new node attempts
to transfer plock state, it fails to create a new
checkpoint because it did not unlink the previous
checkpoint created by the failed node. This leads
to any existing plock state not being transferred
to the newly joined node. The newly joined node
will then mistakenly grant plocks to itself that
may conflict with plocks that the other node could
not transfer. This leads to:
1. conflicting plocks being held concurrently
2. dangling plocks that are not held but not removed
In the explanation above, the reason the remaining
node does not unlink the checkpoint that had been
created by the other node, is that it does not know
that the other node was in charge of the checkpoint.
It could only know this if it had been present before
and after the previous membership change. Because
there are only two nodes, this was not possible.
This, however, is also the point exploited to fix
the problem. When there are only two members, a new
node can assume that the other node is in charge of
the checkpoint.
The following test shows the problem/fix using
a program "doplock" that requests an exclusive,
blocking posix lock on the given file.
node1: mount /gfs
node2: mount /gfs
node1: touch /gfs/test
node1: doplock /gfs/test (granted)
node2: doplock /gfs/test (blocks)
node1: killed
node2: recovery for node1
node2: doplock above granted the lock
node1: restarts
node1: mount /gfs
node1: doplock /gfs/test
In the last step, the node1 doplock should block
because node2 holds the lock. Before the fix,
it was granted.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/gfs_controld/plock.c | 7 +++++++
group/gfs_controld/recover.c | 14 ++++++++++++--
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index 4330a2c..d96604f 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -2086,6 +2086,13 @@ void store_plocks(struct mountgroup *mg, int nodeid)
}
if (rv == SA_AIS_ERR_EXIST) {
log_group(mg, "store_plocks: ckpt already exists");
+ log_error("store_plocks: ckpt already exists");
+ /* TODO: best to unlink and retry? */
+ /*
+ _unlink_checkpoint(mg, &name);
+ sleep(1);
+ goto open_retry;
+ */
return;
}
if (rv != SA_AIS_OK) {
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
index b33b3fd..f70f798 100644
--- a/group/gfs_controld/recover.c
+++ b/group/gfs_controld/recover.c
@@ -1257,8 +1257,15 @@ void update_master_nodeid(struct mountgroup *mg)
{
struct mg_member *memb;
int new = -1, low = -1;
+ int other_nodeid = -1;
+ int total = 0;
list_for_each_entry(memb, &mg->members, list) {
+ total++;
+
+ if (memb->nodeid != our_nodeid)
+ other_nodeid = memb->nodeid;
+
if (low == -1 || memb->nodeid < low)
low = memb->nodeid;
if (!memb->finished)
@@ -1268,6 +1275,9 @@ void update_master_nodeid(struct mountgroup *mg)
}
mg->master_nodeid = new;
mg->low_nodeid = low;
+
+ if (new == -1 && total == 2)
+ mg->master_nodeid = other_nodeid;
}
/* This can happen before we receive a journals message for our mount. */
@@ -1354,8 +1364,8 @@ void recover_members(struct mountgroup *mg, int num_nodes,
*pos_out = pos;
*neg_out = neg;
- log_group(mg, "total members %d master_nodeid %d prev %d",
- mg->memb_count, mg->master_nodeid, prev_master_nodeid);
+ log_group(mg, "total members %d master_nodeid %d prev %d failed %d",
+ mg->memb_count, mg->master_nodeid, prev_master_nodeid, master_failed);
/* The master failed and we're the new master, we need to: