Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=fe9... Commit: fe9a89972834d0459c312bede9e4a32df52e445a Parent: 1efff01c28dadf735cab90433c20a7dcbe3c81ef Author: Eduardo Damato edamato@redhat.com AuthorDate: Tue Sep 29 09:58:18 2009 -0400 Committer: Lon Hohberger lhh@redhat.com CommitterDate: Fri Oct 30 16:30:23 2009 -0400
qdisk: Implement I/O timeouts in qdiskd
This allows administrators to make qdiskd reboot the system if it cannot write its status out for interval*tko seconds.
Resolves: rhbz#511113
Part 1/4
Signed-off-by: Eduardo Damato edamato@redhat.com Signed-off-by: Lon Hohberger lhh@redhat.com --- cman/man/qdisk.5 | 7 +++++++ cman/qdisk/disk.h | 3 ++- cman/qdisk/main.c | 34 ++++++++++++++++++++++++++++++++-- 3 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/cman/man/qdisk.5 b/cman/man/qdisk.5 index 65b9956..513d56b 100644 --- a/cman/man/qdisk.5 +++ b/cman/man/qdisk.5 @@ -291,6 +291,13 @@ if it takes more than (interval * tko) seconds to complete a quorum disk pass. The default for this value is 0 (off).
.in 9 +\fIio_timeout\fP\fB="\fP0\fB"\fP +.in 12 +If set to 1 (on), qdiskd will watch internal timers and reboot the node +if qdisk is not able to write to disk after (interval * tko) seconds. +The default for this value is 0 (off). + +.in 9 \fIscheduler\fP\fB="\fPrr\fB"\fP .in 12 Valid values are 'rr', 'fifo', and 'other'. Selects the scheduling queue diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h index 3509339..0b652b2 100644 --- a/cman/qdisk/disk.h +++ b/cman/qdisk/disk.h @@ -73,7 +73,8 @@ typedef enum { RF_PARANOID = 0x8, RF_ALLOW_KILL = 0x10, RF_UPTIME = 0x20, - RF_CMAN_LABEL = 0x40 + RF_CMAN_LABEL = 0x40, + RF_IOTIMEOUT = 0x80 } run_flag_t;
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c index 81999a0..c86759e 100644 --- a/cman/qdisk/main.c +++ b/cman/qdisk/main.c @@ -867,7 +867,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) int low_id, bid_pending = 0, score, score_max, score_req, upgrade = 0, count, errors, error_cycles = 0; memb_mask_t mask, master_mask; - struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval; + struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval, lastok;
ctx->qc_status = S_NONE; @@ -877,6 +877,9 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) interval.tv_usec = 0; interval.tv_sec = ctx->qc_interval; + lastok.tv_usec = 0; + lastok.tv_sec = 0; + get_my_score(&score, &score_max); if (score_max < ctx->qc_scoremin) { clulog(LOG_WARNING, "Minimum score (%d) is impossible to " @@ -1065,6 +1068,8 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) clulog(LOG_ERR, "Error writing to quorum disk\n"); errors++; /* this value isn't really used at this point */ + } else { + get_time(&lastok, ctx->qc_flags&RF_UPTIME); }
/* write out our local status */ @@ -1073,11 +1078,27 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) /* Cycle. We could time the loop and sleep usleep(interval-looptime), but this is fine for now.*/ get_time(&newtime, ctx->qc_flags&RF_UPTIME); - _diff_tv(&diff, &oldtime, &newtime); + /* + * Reboot if the last successful hearbeat was longer ago than interval*TKO_COUNT + */ + _diff_tv(&diff, &lastok, &newtime); + if (_cmp_tv(&maxtime, &diff) == 1 && + ctx->qc_flags & RF_IOTIMEOUT) { + clulog(LOG_EMERG, "Failed to send a heartbeat within " + "%d second%s (%d.%06d) - REBOOTING\n", + (int)maxtime.tv_sec, + maxtime.tv_sec==1?"":"s", + (int)diff.tv_sec, + (int)diff.tv_usec); + if (!(ctx->qc_flags & RF_DEBUG)) + reboot(RB_AUTOBOOT); + } + /* * Reboot if we didn't send a heartbeat in interval*TKO_COUNT */ + _diff_tv(&diff, &oldtime, &newtime); if (_cmp_tv(&maxtime, &diff) == 1 && ctx->qc_flags & RF_PARANOID) { clulog(LOG_EMERG, "Failed to complete a cycle within " @@ -1347,6 +1368,15 @@ get_config_data(char *cluster_name, qd_ctx *ctx, struct h_data *h, int maxh, free(val); } + /* default = off, so, 1 to turn on */ + snprintf(query, sizeof(query), "/cluster/quorumd/@io_timeout"); + if (ccs_get(ccsfd, query, &val) == 0) { + if (!atoi(val)) + ctx->qc_flags &= ~RF_IOTIMEOUT; + else + ctx->qc_flags |= RF_IOTIMEOUT; + free(val); + } /* * Get flag to see if we're supposed to reboot if we can't complete
cluster-commits@lists.stg.fedorahosted.org