This is an automated email from the git hooks/post-receive script.
andyp pushed a commit to branch andyp-dio-test
in repository gfs2-utils.
commit f757540559a9402fa839a96728123f2a1d6f9380
Author: Andrew Price <anprice(a)redhat.com>
AuthorDate: Wed Aug 15 02:09:24 2018 +0100
fsck.gfs2: Use O_DIRECT
Using O_DIRECT required changes in libgfs2 and fsck.gfs2 to make sure
that I/O buffers are appropriately aligned. The posix_fadvise()-based
readahead is also removed. Bypassing the page cache adds significant
overhead in places where buffered I/O was previously beneficial; this
overhead should be accounted for in future improvements.
Signed-off-by: Andrew Price <anprice(a)redhat.com>
---
gfs2/fsck/initialize.c | 42 +++++++++++--------
gfs2/fsck/metawalk.c | 101 ----------------------------------------------
gfs2/fsck/pass1.c | 10 -----
gfs2/libgfs2/buf.c | 10 +++--
gfs2/libgfs2/fs_ops.c | 4 +-
gfs2/libgfs2/structures.c | 20 +++++----
6 files changed, 45 insertions(+), 142 deletions(-)
diff --git a/gfs2/fsck/initialize.c b/gfs2/fsck/initialize.c
index d1c620a..f6eea30 100644
--- a/gfs2/fsck/initialize.c
+++ b/gfs2/fsck/initialize.c
@@ -1,5 +1,6 @@
#include "clusterautoconfig.h"
+#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
@@ -135,10 +136,12 @@ static int set_block_ranges(struct gfs2_sbd *sdp)
struct osi_node *n, *next = NULL;
struct rgrp_tree *rgd;
struct gfs2_rindex *ri;
- char buf[sdp->sd_sb.sb_bsize];
uint64_t rmax = 0;
uint64_t rmin = 0;
- int error;
+ ssize_t bytes;
+ off_t off;
+ void *buf;
+ int err;
log_info( _("Setting block ranges..."));
@@ -165,28 +168,32 @@ static int set_block_ranges(struct gfs2_sbd *sdp)
last_data_block = rmax;
first_data_block = rmin;
- if (fsck_lseek(sdp->device_fd, (last_fs_block * sdp->sd_sb.sb_bsize))){
- log_crit( _("Can't seek to last block in file system: %llu"
- " (0x%llx)\n"), (unsigned long long)last_fs_block,
- (unsigned long long)last_fs_block);
+ err = posix_memalign(&buf, getpagesize(), sdp->sd_sb.sb_bsize);
+ if (err != 0) {
+ log_crit(_("Failed to allocate memory for buffer: %s\n"),
+ strerror(err));
goto fail;
}
-
- memset(buf, 0, sdp->sd_sb.sb_bsize);
- error = read(sdp->device_fd, buf, sdp->sd_sb.sb_bsize);
- if (error != sdp->sd_sb.sb_bsize){
- log_crit( _("Can't read last block in file system (error %u), "
- "last_fs_block: %llu (0x%llx)\n"), error,
- (unsigned long long)last_fs_block,
- (unsigned long long)last_fs_block);
- goto fail;
+ off = last_fs_block * sdp->sd_sb.sb_bsize;
+ bytes = pread(sdp->device_fd, buf, sdp->sd_sb.sb_bsize, off);
+ if (bytes == -1) {
+ log_crit(_("Error reading last filesystem block (%"PRIu64"): %s\n"),
+ last_fs_block, strerror(errno));
+ goto fail_free;
+ }
+ if (bytes != sdp->sd_sb.sb_bsize) {
+ log_crit(_("Bad read of %ldB for last filesystem block (%"PRIu64")\n"),
+ (long int)bytes, last_fs_block);
+ goto fail_free;
}
log_info(_("0x%llx to 0x%llx\n"), (unsigned long long)first_data_block,
(unsigned long long)last_data_block);
+ free(buf);
return 0;
-
- fail:
+fail_free:
+ free(buf);
+fail:
log_info( _("Error\n"));
return -1;
}
@@ -1569,6 +1576,7 @@ int initialize(struct gfs2_sbd *sdp, int force_check, int preen,
open_flag = O_RDONLY;
else
open_flag = O_RDWR | O_EXCL;
+ open_flag |= O_DIRECT|O_CLOEXEC|O_NOATIME;
sdp->device_fd = open(opts.device, open_flag);
if (sdp->device_fd < 0) {
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
index a7780d7..027587a 100644
--- a/gfs2/fsck/metawalk.c
+++ b/gfs2/fsck/metawalk.c
@@ -683,37 +683,6 @@ bad_leaf:
return 1;
}
-static int u64cmp(const void *p1, const void *p2)
-{
- uint64_t a = *(uint64_t *)p1;
- uint64_t b = *(uint64_t *)p2;
-
- if (a > b)
- return 1;
- if (a < b)
- return -1;
-
- return 0;
-}
-
-static void dir_leaf_reada(struct gfs2_inode *ip, uint64_t *tbl, unsigned hsize)
-{
- uint64_t *t = alloca(hsize * sizeof(uint64_t));
- uint64_t leaf_no;
- struct gfs2_sbd *sdp = ip->i_sbd;
- unsigned n = 0;
- unsigned i;
-
- for (i = 0; i < hsize; i++) {
- leaf_no = be64_to_cpu(tbl[i]);
- if (valid_block_ip(ip, leaf_no))
- t[n++] = leaf_no * sdp->bsize;
- }
- qsort(t, n, sizeof(uint64_t), u64cmp);
- for (i = 0; i < n; i++)
- posix_fadvise(sdp->device_fd, t[i], sdp->bsize, POSIX_FADV_WILLNEED);
-}
-
/* Checks exhash directory entries */
int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass)
{
@@ -738,17 +707,10 @@ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass)
orig_di_height = ip->i_di.di_height;
orig_di_blocks = ip->i_di.di_blocks;
- /* Turn off system readahead */
- posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_RANDOM);
-
- /* Readahead */
- dir_leaf_reada(ip, tbl, hsize);
-
if (pass->check_hash_tbl) {
error = pass->check_hash_tbl(ip, tbl, hsize, pass->private);
if (error < 0) {
free(tbl);
- posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
return error;
}
/* If hash table changes were made, read it in again. */
@@ -785,7 +747,6 @@ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass)
(unsigned long long)ip->i_di.di_num.no_addr,
(unsigned long long)ip->i_di.di_num.no_addr);
free(tbl);
- posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
return 1;
}
lindex = 0;
@@ -830,7 +791,6 @@ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass)
struct gfs2_leaf leaf;
if (fsck_abort) {
free(tbl);
- posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
return 0;
}
error = check_leaf(ip, lindex, pass, &leaf_no, &leaf,
@@ -879,7 +839,6 @@ int check_leaf_blks(struct gfs2_inode *ip, struct metawalk_fxns *pass)
lindex += ref_count;
} /* for every leaf block */
free(tbl);
- posix_fadvise(sdp->device_fd, 0, 0, POSIX_FADV_NORMAL);
return 0;
}
@@ -1157,59 +1116,6 @@ static void free_metalist(struct gfs2_inode *ip, osi_list_t *mlp)
}
}
-static void file_ra(struct gfs2_inode *ip, struct gfs2_buffer_head *bh,
- int head_size, int maxptrs, int h)
-{
- struct gfs2_sbd *sdp = ip->i_sbd;
- uint64_t *p, sblock = 0, block;
- int extlen = 0;
-
- if (h + 2 == ip->i_di.di_height) {
- p = (uint64_t *)(bh->b_data + head_size);
- if (*p && *(p + 1)) {
- sblock = be64_to_cpu(*p);
- p++;
- block = be64_to_cpu(*p);
- extlen = block - sblock;
- if (extlen > 1 && extlen <= maxptrs) {
- posix_fadvise(sdp->device_fd,
- sblock * sdp->bsize,
- (extlen + 1) * sdp->bsize,
- POSIX_FADV_WILLNEED);
- return;
- }
- }
- extlen = 0;
- }
- for (p = (uint64_t *)(bh->b_data + head_size);
- p < (uint64_t *)(bh->b_data + sdp->bsize); p++) {
- if (*p) {
- if (!sblock) {
- sblock = be64_to_cpu(*p);
- extlen = 1;
- continue;
- }
- block = be64_to_cpu(*p);
- if (block == sblock + extlen) {
- extlen++;
- continue;
- }
- }
- if (extlen && sblock) {
- if (extlen > 1)
- extlen--;
- posix_fadvise(sdp->device_fd, sblock * sdp->bsize,
- extlen * sdp->bsize,
- POSIX_FADV_WILLNEED);
- extlen = 0;
- p--;
- }
- }
- if (extlen)
- posix_fadvise(sdp->device_fd, sblock * sdp->bsize,
- extlen * sdp->bsize, POSIX_FADV_WILLNEED);
-}
-
/**
* build_and_check_metalist - check a bunch of indirect blocks
* This includes hash table blocks for directories
@@ -1230,7 +1136,6 @@ static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp,
int h, head_size, iblk_type;
uint64_t *ptr, block, *undoptr;
int error, was_duplicate, is_valid;
- int maxptrs;
osi_list_add(&metabh->b_altlist, &mlp[0]);
@@ -1254,16 +1159,12 @@ static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp,
iblk_type = GFS2_METATYPE_IN;
if (ip->i_sbd->gfs1) {
head_size = sizeof(struct gfs_indirect);
- maxptrs = (ip->i_sbd->bsize - head_size) /
- sizeof(uint64_t);
} else {
head_size = sizeof(struct gfs2_meta_header);
- maxptrs = ip->i_sbd->sd_inptrs;
}
} else {
iblk_type = GFS2_METATYPE_DI;
head_size = sizeof(struct gfs2_dinode);
- maxptrs = ip->i_sbd->sd_diptrs;
}
prev_list = &mlp[h - 1];
cur_list = &mlp[h];
@@ -1278,8 +1179,6 @@ static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp,
continue;
}
- if (pass->readahead)
- file_ra(ip, bh, head_size, maxptrs, h);
/* Now check the metadata itself */
for (ptr = (uint64_t *)(bh->b_data + head_size);
(char *)ptr < (bh->b_data + ip->i_sbd->bsize);
diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c
index 3c2f74f..2788b1b 100644
--- a/gfs2/fsck/pass1.c
+++ b/gfs2/fsck/pass1.c
@@ -1889,22 +1889,12 @@ static int pass1_process_bitmap(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, uin
uint64_t block;
struct gfs2_inode *ip;
int q;
- /* Readahead numbers arrived at by experiment */
- unsigned rawin = 50;
- unsigned ralen = 100 * sdp->bsize;
- unsigned r = 0;
for (i = 0; i < n; i++) {
int is_inode;
uint32_t check_magic;
block = ibuf[i];
-
- if (r++ == rawin) {
- posix_fadvise(sdp->device_fd, block * sdp->bsize, ralen, POSIX_FADV_WILLNEED);
- r = 0;
- }
-
/* skip gfs1 rindex indirect blocks */
if (sdp->gfs1 && blockfind(&gfs1_rindex_blks, block)) {
log_debug(_("Skipping rindex indir block "
diff --git a/gfs2/libgfs2/buf.c b/gfs2/libgfs2/buf.c
index 92cd393..a2af4ff 100644
--- a/gfs2/libgfs2/buf.c
+++ b/gfs2/libgfs2/buf.c
@@ -26,14 +26,16 @@
struct gfs2_buffer_head *bget(struct gfs2_sbd *sdp, uint64_t num)
{
struct gfs2_buffer_head *bh;
+ void *buf;
- bh = calloc(1, sizeof(struct gfs2_buffer_head) + sdp->bsize);
- if (bh == NULL)
+ if (posix_memalign(&buf, getpagesize(), sizeof(*bh) + sdp->bsize))
return NULL;
+ memset(buf, 0, sizeof(*bh) + sdp->bsize);
+ bh = (struct gfs2_buffer_head *)((char *)buf + sdp->bsize);
bh->b_blocknr = num;
bh->sdp = sdp;
- bh->iov.iov_base = (char *)bh + sizeof(struct gfs2_buffer_head);
+ bh->iov.iov_base = buf;
bh->iov.iov_len = sdp->bsize;
return bh;
@@ -106,7 +108,7 @@ int brelse(struct gfs2_buffer_head *bh)
bh->b_blocknr = -1;
if (bh->b_altlist.next && !osi_list_empty(&bh->b_altlist))
osi_list_del(&bh->b_altlist);
- free(bh);
+ free(bh->iov.iov_base);
return error;
}
diff --git a/gfs2/libgfs2/fs_ops.c b/gfs2/libgfs2/fs_ops.c
index 7e87e43..2c1b1d5 100644
--- a/gfs2/libgfs2/fs_ops.c
+++ b/gfs2/libgfs2/fs_ops.c
@@ -1500,13 +1500,13 @@ int lgfs2_write_filemeta(struct gfs2_inode *ip)
}
lgfs2_fill_indir(start, bh->b_data + sdp->bsize, ptr0, ptrs, &p);
if (bwrite(bh)) {
- free(bh);
+ free(bh->iov.iov_base);
return 1;
}
}
ptr0 += ptrs;
}
- free(bh);
+ free(bh->iov.iov_base);
return 0;
}
diff --git a/gfs2/libgfs2/structures.c b/gfs2/libgfs2/structures.c
index c84701d..b857489 100644
--- a/gfs2/libgfs2/structures.c
+++ b/gfs2/libgfs2/structures.c
@@ -74,12 +74,15 @@ int lgfs2_sb_write(const struct gfs2_sb *sb, int fd, const unsigned bsize)
struct iovec *iov;
const size_t sb_addr = GFS2_SB_ADDR * GFS2_BASIC_BLOCK / bsize;
const size_t len = sb_addr + 1;
+ ssize_t bytes;
+ void *buf;
/* We only need 2 blocks: one for zeroing and a second for the superblock */
- char *buf = calloc(2, bsize);
- if (buf == NULL)
+ if ((err = posix_memalign(&buf, getpagesize(), 2 * bsize))) {
+ errno = err;
return -1;
-
+ }
+ memset(buf, 0, 2 * bsize);
iov = malloc(len * sizeof(*iov));
if (iov == NULL)
goto out_buf;
@@ -89,10 +92,11 @@ int lgfs2_sb_write(const struct gfs2_sb *sb, int fd, const unsigned bsize)
iov[i].iov_len = bsize;
}
- gfs2_sb_out(sb, buf + bsize);
- iov[sb_addr].iov_base = buf + bsize;
+ iov[sb_addr].iov_base = (char *)buf + bsize;
+ gfs2_sb_out(sb, iov[sb_addr].iov_base);
- if (pwritev(fd, iov, len, 0) < (len * bsize))
+ bytes = pwritev(fd, iov, len, 0);
+ if (bytes < (long)(len * bsize))
goto out_iov;
err = 0;
@@ -181,7 +185,7 @@ int lgfs2_write_journal_data(struct gfs2_inode *ip)
#endif
if (bwrite(bh)) {
- free(bh);
+ free(bh->iov.iov_base);
return -1;
}
@@ -190,7 +194,7 @@ int lgfs2_write_journal_data(struct gfs2_inode *ip)
} while (++bh->b_blocknr < jext0 + blocks);
- free(bh);
+ free(bh->iov.iov_base);
return 0;
}
--
To stop receiving notification emails like this one, please contact
the administrator of this repository.