Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=fe4... Commit: fe46f6b6e9ed9a40c37fa60966fafc1cf07e36d2 Parent: 8742ae97a69c8cc282faf39d8c1e7bfda441e5b2 Author: Eduardo Damato edamato@redhat.com AuthorDate: Tue Sep 29 10:06:26 2009 -0400 Committer: Lon Hohberger lhh@redhat.com CommitterDate: Fri Oct 30 16:37:12 2009 -0400
qdisk: Implement I/O timeout for read
This patch records the time of the last successful read and reboots the system if that read happened more than interval*tko seconds ago.
Resolves: rhbz#511113
Part 3/4
Signed-off-by: Eduardo Damato edamato@redhat.com Signed-off-by: Lon Hohberger lhh@redhat.com --- cman/qdisk/main.c | 32 ++++++++++++++++++++++++++------ 1 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c index b698f2c..250406a 100644 --- a/cman/qdisk/main.c +++ b/cman/qdisk/main.c @@ -867,7 +867,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) int low_id, bid_pending = 0, score, score_max, score_req, upgrade = 0, count, errors, error_cycles = 0; memb_mask_t mask, master_mask; - struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval, lastok; + struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval, rd_lastok, wr_lastok;
ctx->qc_status = S_NONE; @@ -877,8 +877,11 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) interval.tv_usec = 0; interval.tv_sec = ctx->qc_interval; - lastok.tv_usec = 0; - lastok.tv_sec = 0; + rd_lastok.tv_usec = 0; + rd_lastok.tv_sec = 0; + + wr_lastok.tv_usec = 0; + wr_lastok.tv_sec = 0; get_my_score(&score, &score_max); if (score_max < ctx->qc_scoremin) { @@ -893,7 +896,8 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) get_time(&oldtime, (ctx->qc_flags&RF_UPTIME)); /* Read everyone else's status */ - errors = read_node_blocks(ctx, ni, max); + if ( (errors = read_node_blocks(ctx, ni, max) == 0 )) + get_time(&rd_lastok, ctx->qc_flags&RF_UPTIME);
/* Check for node transitions */ check_transitions(ctx, ni, max, mask); @@ -1069,7 +1073,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) errors++; /* this value isn't really used at this point */ } else { - get_time(&lastok, ctx->qc_flags&RF_UPTIME); + get_time(&wr_lastok, ctx->qc_flags&RF_UPTIME); }
/* write out our local status */ @@ -1082,7 +1086,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) /* * Reboot if the last successful hearbeat was longer ago than interval*TKO_COUNT */ - _diff_tv(&diff, &lastok, &newtime); + _diff_tv(&diff, &wr_lastok, &newtime); if (_cmp_tv(&maxtime, &diff) == 1 && ctx->qc_flags & RF_IOTIMEOUT) { clulog(LOG_EMERG, "Failed to send a heartbeat within " @@ -1094,6 +1098,22 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max) if (!(ctx->qc_flags & RF_DEBUG)) reboot(RB_AUTOBOOT); } + + /* + * Reboot if the last successful hearbeat was longer ago than interval*TKO_COUNT + */ + _diff_tv(&diff, &rd_lastok, &newtime); + if (_cmp_tv(&maxtime, &diff) == 1 && + ctx->qc_flags & RF_IOTIMEOUT) { + clulog(LOG_EMERG, "Failed to read from qdisk within " + "%d second%s (%d.%06d) - REBOOTING\n", + (int)maxtime.tv_sec, + maxtime.tv_sec==1?"":"s", + (int)diff.tv_sec, + (int)diff.tv_usec); + if (!(ctx->qc_flags & RF_DEBUG)) + reboot(RB_AUTOBOOT); + } /* * Reboot if we didn't send a heartbeat in interval*TKO_COUNT
cluster-commits@lists.stg.fedorahosted.org