Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=1d1ba64f39fe8a16deef298aa1e36cf6333daa30
Commit: 1d1ba64f39fe8a16deef298aa1e36cf6333daa30
Parent: 90194b38e88b7efdbe7a784aba2393259c135c5d
Author: Jonathan Brassow <jbrassow(a)redhat.com>
AuthorDate: Thu Apr 26 18:21:47 2012 -0500
Committer: Jonathan Brassow <jbrassow(a)redhat.com>
CommitterDate: Thu Apr 26 18:21:47 2012 -0500
Fix 73372: Any change to lvm.conf causes HA LVM to fail
Rather than utterly failing, warn instead. The user should make
sure that the initrd has the latest lvm.conf file, but there are
cases where we certainly don't want to utterly fail. (Like if they
change lvm.conf in a minor way and then a machine crashes before
they've updated the initrd - causing HA LVM to not work properly.)
---
rgmanager/src/resources/lvm.sh | 7 ++++++-
1 files changed, 6 insertions(+), 1 deletions(-)
diff --git a/rgmanager/src/resources/lvm.sh b/rgmanager/src/resources/lvm.sh
index 1e2c93b..cb2f5ec 100755
--- a/rgmanager/src/resources/lvm.sh
+++ b/rgmanager/src/resources/lvm.sh
@@ -108,7 +108,12 @@ function ha_lvm_proper_setup_check
if [ "$(find /boot -name *.img -newer /etc/lvm/lvm.conf)" == "" ]; then
ocf_log err "HA LVM: Improper setup detected"
ocf_log err "- initrd image needs to be newer than lvm.conf"
- return $OCF_ERR_GENERIC
+
+ # While dangerous if not done the first time, there are many
+ # cases where we don't simply want to fail here. Instead,
+ # keep warning until the user remakes the initrd - or has
+ # it done for them by upgrading the kernel.
+ #return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=90194b38e88b7efdbe7a784aba2393259c135c5d
Commit: 90194b38e88b7efdbe7a784aba2393259c135c5d
Parent: bdf7691f84eac400323c84a490f0147ec851afdf
Author: Jonathan Brassow <jbrassow(a)redhat.com>
AuthorDate: Thu Apr 26 18:05:50 2012 -0500
Committer: Jonathan Brassow <jbrassow(a)redhat.com>
CommitterDate: Thu Apr 26 18:05:50 2012 -0500
Bring HA LVM inline with upstream to collect the fixes there.
Most fixes revolve around the manipulation of tags and improving
the robustness of handling failure scenarios.
---
rgmanager/src/resources/lvm.sh | 37 +++++++++++++++----
rgmanager/src/resources/lvm_by_lv.sh | 63 +++++++++++++++------------------
rgmanager/src/resources/lvm_by_vg.sh | 37 +++++++++++++++-----
3 files changed, 86 insertions(+), 51 deletions(-)
diff --git a/rgmanager/src/resources/lvm.sh b/rgmanager/src/resources/lvm.sh
index 0ffcadf..1e2c93b 100755
--- a/rgmanager/src/resources/lvm.sh
+++ b/rgmanager/src/resources/lvm.sh
@@ -42,6 +42,33 @@ rv=0
function ha_lvm_proper_setup_check
{
##
+ # Does the Volume Group exist?
+ # 1) User may have forgotten to create it
+ # 2) User may have misspelled it in the config file
+ ##
+ if ! vgs $OCF_RESKEY_vg_name --config 'global{locking_type=0}'>& /dev/null; then
+ ocf_log err "HA LVM: Unable to get volume group attributes for $OCF_RESKEY_vg_name"
+ return $OCF_ERR_GENERIC
+ fi
+
+ ##
+ # Are we using the "tagging" or "CLVM" variant?
+ # The CLVM variant will have the cluster attribute set
+ ##
+ if [[ $(vgs -o attr --noheadings --config 'global{locking_type=0}' $OCF_RESKEY_vg_name 2>/dev/null) =~ .....c ]]; then
+ # Is clvmd running?
+ if ! ps -C clvmd >& /dev/null; then
+ ocf_log err "HA LVM: $OCF_RESKEY_vg_name has the cluster attribute set, but 'clvmd' is not running"
+ return $OCF_ERR_GENERIC
+ fi
+ return $OCF_SUCCESS
+ fi
+
+ ##
+ # The "tagging" variant is being used if we have gotten this far.
+ ##
+
+ ##
# The default for lvm.conf:activation/volume_list is empty,
# this must be changed for HA LVM.
##
@@ -93,9 +120,7 @@ function ha_lvm_proper_setup_check
case $1 in
start)
- if ! [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ .....c ]]; then
- ha_lvm_proper_setup_check || exit 1
- fi
+ ha_lvm_proper_setup_check || exit 1
if [ -z $OCF_RESKEY_lv_name ]; then
vg_start || exit 1
@@ -115,11 +140,7 @@ status|monitor)
;;
stop)
- if ! [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ .....c ]]; then
- if ! ha_lvm_proper_setup_check; then
- ocf_log err "WARNING: An improper setup can cause data corruption!"
- fi
- fi
+ ha_lvm_proper_setup_check
if [ -z $OCF_RESKEY_lv_name ]; then
vg_stop || exit 1
diff --git a/rgmanager/src/resources/lvm_by_lv.sh b/rgmanager/src/resources/lvm_by_lv.sh
index 966214f..4971173 100644
--- a/rgmanager/src/resources/lvm_by_lv.sh
+++ b/rgmanager/src/resources/lvm_by_lv.sh
@@ -320,45 +320,40 @@ lv_activate()
if ! lv_activate_and_tag $1 $my_name $lv_path; then
ocf_log err "Failed to $1 $lv_path"
- if [ "$1" == "start" ]; then
- ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
-
- if vgreduce --removemissing --config \
- "activation { volume_list = \"$OCF_RESKEY_vg_name\" }" \
- $OCF_RESKEY_vg_name; then
- ocf_log notice "$OCF_RESKEY_vg_name now consistent"
- owner=`lvs -o tags --noheadings $lv_path`
- if [ ! -z $owner ] && [ $owner != $my_name ]; then
- if is_node_member_clustat $owner ; then
- ocf_log err "$owner owns $lv_path unable to $1"
- return $OCF_ERR_GENERIC
- fi
- ocf_log notice "Owner of $lv_path is not in the cluster"
- ocf_log notice "Stealing $lv_path"
-
- lvchange --deltag $owner $lv_path
- if [ $? -ne 0 ]; then
- ocf_log err "Failed to steal $lv_path from $owner"
- return $OCF_ERR_GENERIC
- fi
-
- # Warning --deltag doesn't always result in failure
- if [ ! -z `lvs -o tags --noheadings $lv_path` ]; then
- ocf_log err "Failed to steal $lv_path from $owner."
- return $OCF_ERR_GENERIC
- fi
+ ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
+
+ if vgreduce --removemissing --force --config \
+ "activation { volume_list = \"$OCF_RESKEY_vg_name\" }" \
+ $OCF_RESKEY_vg_name; then
+ ocf_log notice "$OCF_RESKEY_vg_name now consistent"
+ owner=`lvs -o tags --noheadings $lv_path`
+ if [ ! -z $owner ] && [ $owner != $my_name ]; then
+ if is_node_member_clustat $owner ; then
+ ocf_log err "$owner owns $lv_path unable to $1"
+ return $OCF_ERR_GENERIC
fi
+ ocf_log notice "Owner of $lv_path is not in the cluster"
+ ocf_log notice "Stealing $lv_path"
- if ! lv_activate_and_tag $1 $my_name $lv_path; then
- ocf_log err "Failed second attempt to $1 $lv_path"
+ lvchange --deltag $owner $lv_path
+ if [ $? -ne 0 ]; then
+ ocf_log err "Failed to steal $lv_path from $owner"
return $OCF_ERR_GENERIC
- else
- ocf_log notice "Second attempt to $1 $lv_path successful"
- return $OCF_SUCCESS
fi
- else
- ocf_log err "Failed to make $OCF_RESKEY_vg_name consistent"
+
+ # Warning --deltag doesn't always result in failure
+ if [ ! -z `lvs -o tags --noheadings $lv_path` ]; then
+ ocf_log err "Failed to steal $lv_path from $owner."
+ return $OCF_ERR_GENERIC
+ fi
+ fi
+
+ if ! lv_activate_and_tag $1 $my_name $lv_path; then
+ ocf_log err "Failed second attempt to $1 $lv_path"
return $OCF_ERR_GENERIC
+ else
+ ocf_log notice "Second attempt to $1 $lv_path successful"
+ return $OCF_SUCCESS
fi
else
ocf_log err "Failed to $1 $lv_path"
diff --git a/rgmanager/src/resources/lvm_by_vg.sh b/rgmanager/src/resources/lvm_by_vg.sh
index 08d8c98..0dd2aaa 100755
--- a/rgmanager/src/resources/lvm_by_vg.sh
+++ b/rgmanager/src/resources/lvm_by_vg.sh
@@ -50,7 +50,7 @@ function vg_owner
return 1
}
-function strip_tags
+function _strip_tags
{
local i
@@ -67,6 +67,29 @@ function strip_tags
return $OCF_SUCCESS
}
+function strip_tags
+{
+ if ! _strip_tags; then
+ ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
+
+ if ! vgreduce --removemissing --force --config \
+ "activation { volume_list = \"$OCF_RESKEY_vg_name\" }" \
+ $OCF_RESKEY_vg_name; then
+
+ ocf_log err "Failed to make $OCF_RESKEY_vg_name consistent"
+ return $OCF_ERR_GENERIC
+ fi
+
+ ocf_log notice "Cleanup of $OCF_RESKEY_vg_name successful"
+ fi
+ if ! _strip_tags; then
+ ocf_log err "Failed 2nd attempt to remove tags from, $OCF_RESKEY_vg_name"
+ return $OCF_ERR_GENERIC
+ fi
+
+ return $OCF_SUCCESS
+}
+
function strip_and_add_tag
{
if ! strip_tags; then
@@ -179,7 +202,7 @@ function vg_start_clustered
ocf_log err "Failed to activate volume group, $OCF_RESKEY_vg_name"
ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
- if ! vgreduce --removemissing $OCF_RESKEY_vg_name; then
+ if ! vgreduce --removemissing --force $OCF_RESKEY_vg_name; then
ocf_log err "Failed to make $OCF_RESKEY_vg_name consistent"
return $OCF_ERR_GENERIC
fi
@@ -257,7 +280,7 @@ function vg_start_single
ocf_log err "Failed to activate volume group, $OCF_RESKEY_vg_name"
ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
- if ! vgreduce --removemissing --config \
+ if ! vgreduce --removemissing --force --config \
"activation { volume_list = \"$OCF_RESKEY_vg_name\" }" \
$OCF_RESKEY_vg_name; then
@@ -265,11 +288,7 @@ function vg_start_single
return $OCF_ERR_GENERIC
fi
- vg_owner
- if [ $? -eq 0 ]; then
- ocf_log err "Unable to claim ownership of $OCF_RESKEY_vg_name"
- return $OCF_ERR_GENERIC
- fi
+ ocf_log notice "Cleanup of $OCF_RESKEY_vg_name successful"
if ! strip_and_add_tag ||
! vgchange -ay $OCF_RESKEY_vg_name; then
@@ -282,7 +301,7 @@ function vg_start_single
else
# The activation commands succeeded, but did they do anything?
# Make sure all the logical volumes are active
- results=(`lvs -o name,attr --noheadings 2> /dev/null $OCF_RESKEY_vg_name`)
+ results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
a=0
while [ ! -z ${results[$a]} ]; do
if [[ ! ${results[$(($a + 1))]} =~ ....a. ]]; then
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=93eb663d6b4bfed821ea4f66f18396eadb522468
Commit: 93eb663d6b4bfed821ea4f66f18396eadb522468
Parent: e48ce7f40c8e285d5cddc654ed39e1e4381d44ed
Author: Jonathan Brassow <jbrassow(a)redhat.com>
AuthorDate: Thu Apr 26 12:39:47 2012 -0500
Committer: Jonathan Brassow <jbrassow(a)redhat.com>
CommitterDate: Thu Apr 26 14:17:37 2012 -0500
Fix bug in cmirror that caused incorrect status info to print on some nodes.
Here's the upstream commit message:
commit 172a9457bf8dcc1e5c3a607be2e8d1ac80ac619b
Author: Jonathan Earl Brassow <jbrassow(a)redhat.com>
Date: Thu Apr 26 17:30:49 2012 +0000
Fix bug in cmirror that caused incorrect status info to print on some nodes.
Looking at the code in cmirrord/local.c, we can see the various different
request types handled in different ways. Some information that is non-changing
does not need to go around the cluster and can be short-circuited. For
example, once the cluster mirror is in-sync, it is pointless to continue
sending that query around the cluster. We can save network bandwidth and reply
directly back to the kernel. When it comes to status information, there are
two types 'TABLE' and 'INFO'. The 'TABLE' information never changes and
belongs to the group of requests that can be safely short-circuited. The
'STATUS' information can change - and will change if a device fails. Thus it
cannot be short-circuited, but this is exactly what was found. The 'STATUS'
information request was being short-circuited and therefore never reporting the
failure condition to anyone other than the "server" that experienced it
directly.
---
cmirror/src/local.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/cmirror/src/local.c b/cmirror/src/local.c
index 3e6d74a..29184a0 100644
--- a/cmirror/src/local.c
+++ b/cmirror/src/local.c
@@ -209,7 +209,6 @@ static int do_local_work(void *data)
case DM_CLOG_DTR:
case DM_CLOG_IN_SYNC:
case DM_CLOG_GET_SYNC_COUNT:
- case DM_CLOG_STATUS_INFO:
case DM_CLOG_STATUS_TABLE:
case DM_CLOG_PRESUSPEND:
/* We do not specify ourselves as server here */
@@ -245,6 +244,7 @@ static int do_local_work(void *data)
case DM_CLOG_MARK_REGION:
case DM_CLOG_GET_RESYNC_WORK:
case DM_CLOG_SET_REGION_SYNC:
+ case DM_CLOG_STATUS_INFO:
case DM_CLOG_IS_REMOTE_RECOVERING:
case DM_CLOG_POSTSUSPEND:
r = cluster_send(tfr);
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=bdf7691f84eac400323c84a490f0147ec851afdf
Commit: bdf7691f84eac400323c84a490f0147ec851afdf
Parent: c6fca5e49ad1c8a6e1bce1874d0688aab3ccac26
Author: Jonathan Brassow <jbrassow(a)redhat.com>
AuthorDate: Thu Apr 26 12:39:47 2012 -0500
Committer: Jonathan Brassow <jbrassow(a)redhat.com>
CommitterDate: Thu Apr 26 12:39:47 2012 -0500
Fix bug in cmirror that caused incorrect status info to print on some nodes.
Here's the upstream commit message:
commit 172a9457bf8dcc1e5c3a607be2e8d1ac80ac619b
Author: Jonathan Earl Brassow <jbrassow(a)redhat.com>
Date: Thu Apr 26 17:30:49 2012 +0000
Fix bug in cmirror that caused incorrect status info to print on some nodes.
Looking at the code in cmirrord/local.c, we can see the various different
request types handled in different ways. Some information that is non-changing
does not need to go around the cluster and can be short-circuited. For
example, once the cluster mirror is in-sync, it is pointless to continue
sending that query around the cluster. We can save network bandwidth and reply
directly back to the kernel. When it comes to status information, there are
two types 'TABLE' and 'INFO'. The 'TABLE' information never changes and
belongs to the group of requests that can be safely short-circuited. The
'STATUS' information can change - and will change if a device fails. Thus it
cannot be short-circuited, but this is exactly what was found. The 'STATUS'
information request was being short-circuited and therefore never reporting the
failure condition to anyone other than the "server" that experienced it
directly.
---
cmirror/src/local.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/cmirror/src/local.c b/cmirror/src/local.c
index 3e6d74a..29184a0 100644
--- a/cmirror/src/local.c
+++ b/cmirror/src/local.c
@@ -209,7 +209,6 @@ static int do_local_work(void *data)
case DM_CLOG_DTR:
case DM_CLOG_IN_SYNC:
case DM_CLOG_GET_SYNC_COUNT:
- case DM_CLOG_STATUS_INFO:
case DM_CLOG_STATUS_TABLE:
case DM_CLOG_PRESUSPEND:
/* We do not specify ourselves as server here */
@@ -245,6 +244,7 @@ static int do_local_work(void *data)
case DM_CLOG_MARK_REGION:
case DM_CLOG_GET_RESYNC_WORK:
case DM_CLOG_SET_REGION_SYNC:
+ case DM_CLOG_STATUS_INFO:
case DM_CLOG_IS_REMOTE_RECOVERING:
case DM_CLOG_POSTSUSPEND:
r = cluster_send(tfr);
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=de6fa9a06f0ef100edb092342b1466bac3466e70
Commit: de6fa9a06f0ef100edb092342b1466bac3466e70
Parent: 4b3a6a1103b3f83b1f5d34e0aef387c396126eca
Author: Marek 'marx' Grac <mgrac(a)redhat.com>
AuthorDate: Mon Mar 19 16:45:23 2012 +0100
Committer: Marek 'marx' Grac <mgrac(a)redhat.com>
CommitterDate: Thu Apr 12 09:43:16 2012 +0200
fence agents: Using "delay" option can ends with timeout problems
Resolves: rhbz#804170
---
fence/agents/lib/fencing.py.py | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/fence/agents/lib/fencing.py.py b/fence/agents/lib/fencing.py.py
index 1ff51a4..c4fe218 100644
--- a/fence/agents/lib/fencing.py.py
+++ b/fence/agents/lib/fencing.py.py
@@ -782,8 +782,6 @@ def fence_action(tn, options, set_power_fn, get_power_fn, get_outlet_list = None
print o + options["-C"] + alias
return
- if options["-o"] in ["off", "reboot"]:
- time.sleep(int(options["-f"]))
status = get_power_fn(tn, options)
if status != "on" and status != "off":
@@ -866,6 +864,11 @@ def fence_login(options):
else:
login_eol = "\r\n"
+ ## Do the delay of the fence device before logging in
+ ## Delay is important for two-node clusters fencing but we do not need to delay 'status' operations
+ if options["-o"] in ["off", "reboot"]:
+ time.sleep(int(options["-f"]))
+
try:
re_login = re.compile("(login\s*: )|(Login Name: )|(username: )|(User Name :)", re.IGNORECASE)
re_pass = re.compile("password", re.IGNORECASE)