This commit should give us new 'filterlist' files alongside the 'fullfilelist' and 'fullfiletimelist' files in the /fedora, /archive and /alt folders on the mirrors. The contents of each 'filterlist' file are the same as the corresponding 'fullfilelist', but with all directories, packages (.rpm or .drpm files), device tree boot files (.dtb files) and HTML files (.html files) removed. This gives a massively smaller list which, right now, will be useful for fedfind (it can parse these lists instead of rsync scraping) and may possibly be useful for other things in future, I guess.
It would be nice to have this now as this all came out of the work around improving generation of the mediawriter 'available images' JSON file: we want to use fedfind to generate that, but the rsync scraping is pretty heavy for something that'll run quite frequently. This should improve things quite a bit (especially as I've written fedfind to cache the files and only re-download them if the Last-Modified header changes). I have the fedfind changes all written and tested (I tested against the fullfilelist files).
From 3f61df2f2879c23a7e44271527facce99bc92286 Mon Sep 17 00:00:00 2001 From: Adam Williamson awilliam@redhat.com Date: Fri, 18 Nov 2016 16:34:38 -0800 Subject: [PATCH] Generate filtered file lists for fedfind to use
This adds `filterlist` files alongside the `fullfilelist` and `fullfiletimelist` files. These are much, much shorter lists which skip the entries for packages, ARM device tree boot files, HTML files and directories. They are intended for consumption by fedfind, so it can stop using rsync scraping to discover the image files it looks for. To enable this, we update to a newer version of `create-filelist` from upstream `quick-fedora-mirror` and make `update-fullfiletimelist` create the filterlist files as well.
We also delete a couple of old copies of `create-filelist`; nirik made the two roles that use it share a common copy a few months back, but missed deleting the copy each role had in its `files` directory. --- files/scripts/create-filelist | 10 ++++++++- files/scripts/update-fullfiletimelist | 19 ++++++++++++++-- roles/bodhi2/backend/files/create-filelist | 36 ------------------------------ roles/releng/files/create-filelist | 36 ------------------------------ 4 files changed, 26 insertions(+), 75 deletions(-) delete mode 100644 roles/bodhi2/backend/files/create-filelist delete mode 100644 roles/releng/files/create-filelist
diff --git a/files/scripts/create-filelist b/files/scripts/create-filelist index eeba9d0..8fc3367 100755 --- a/files/scripts/create-filelist +++ b/files/scripts/create-filelist @@ -57,7 +57,9 @@ def recursedir(path='.', skip=[], alwaysskip=['.~tmp~']): def parseopts(): null = open(os.devnull, 'w') p = argparse.ArgumentParser( - description='Generate a list of files and times, suitable for consumption by quick-fedora-mirror.') + description='Generate a list of files and times, suitable for consumption by quick-fedora-mirror, ' + 'and a much smaller list with packages, Device Tree boot files, HTML files and ' + 'directories filtered out, for consumption by fedfind.') p.add_argument('-c', '--checksum', action='store_true', help='Include checksums of all repomd.xml files in the file list.') p.add_argument('-C', '--checksum-file', action='append', dest='checksum_files', @@ -73,6 +75,8 @@ def parseopts(): help='Filename of the file list with times (default: stdout).') p.add_argument('-f', '--filelist', type=argparse.FileType('w'), default=null, help='Filename of the file list without times (default: no plain file list is generated).') + p.add_argument('-F', '--filterlist', type=argparse.FileType('w'), default=null, + help='Filename of the filtered file list for fedfind (default: not generated).')
opts = p.parse_args()
@@ -107,6 +111,10 @@ def main(): for entry in recursedir(skip=opts.skip_files): # opts.filelist.write(entry.path + '\n') print(entry.path, file=opts.filelist) + # write to filtered list if appropriate + skips = ('.rpm', '.drpm', '.dtb', '.html') + if not any(entry.path.endswith(skip) for skip in skips) and not (entry.is_dir()): + print(entry.path, file=opts.filterlist) if entry.name in opts.checksum_files: checksums[entry.path[2:]] = True info = entry.stat(follow_symlinks=False) diff --git a/files/scripts/update-fullfiletimelist b/files/scripts/update-fullfiletimelist index 016ca8e..e70fadc 100755 --- a/files/scripts/update-fullfiletimelist +++ b/files/scripts/update-fullfiletimelist @@ -25,6 +25,7 @@ CREATE=/usr/local/bin/create-filelist # context. FILELIST=fullfilelist TIMELIST='fullfiletimelist-$mod' +FILTERLIST=filterlist
usage () { echo @@ -107,12 +108,14 @@ cd $tmpd for mod in $MODS; do currentfl=$TOPD/$mod/${FILELIST/'$mod'/$mod} currenttl=$TOPD/$mod/${TIMELIST/'$mod'/$mod} + currentsl=$TOPD/$mod/${FILTERLIST/'$mod'/$mod} flname=$(basename $currentfl) tlname=$(basename $currenttl) + slname=$(basename $currentsl)
- $CREATE -c -s -d $TOPD/$mod -f $flname -t $tlname + $CREATE -c -s -d $TOPD/$mod -f $flname -t $tlname -F $slname
- # If a file list exsts and doesn't differ from what we just generated, + # If a file list exists and doesn't differ from what we just generated, # delete the latter. if [[ -f $currentfl ]] && diff -q $currentfl $flname > /dev/null; then rm -f $flname @@ -120,6 +123,9 @@ cd $tmpd if [[ -f $currenttl ]] && diff -q $currenttl $tlname > /dev/null; then rm -f $tlname fi + if [[ -f $currentsl ]] && diff -q $currentsl $slname > /dev/null; then + rm -f $slname + fi done
# Now we have the new file lists but in a temporary directory which @@ -128,10 +134,13 @@ cd $tmpd for mod in $MODS; do currentfl=$TOPD/$mod/${FILELIST/'$mod'/$mod} currenttl=$TOPD/$mod/${TIMELIST/'$mod'/$mod} + currentsl=$TOPD/$mod/${FILTERLIST/'$mod'/$mod} flname=$(basename $currentfl) fldir=$(dirname $currentfl) tlname=$(basename $currenttl) tldir=$(dirname $currenttl) + slname=$(basename $currentsl) + sldir=$(dirname $currentsl)
if [[ -f $flname ]]; then tmpf=$(mktemp -p $fldir $flname.XXXXXXXXXX) @@ -145,6 +154,12 @@ cd $tmpd chmod 644 $tmpf mv $tmpf $currenttl fi + if [[ -f $slname ]]; then + tmpf=$(mktemp -p $sldir $slname.XXXXXXXXXX) + cp -p $slname $tmpf + chmod 644 $tmpf + mv $tmpf $currentsl + fi done
) 9>$LOCKFILE diff --git a/roles/bodhi2/backend/files/create-filelist b/roles/bodhi2/backend/files/create-filelist deleted file mode 100644 index d95000e..0000000 --- a/roles/bodhi2/backend/files/create-filelist +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/python - -# A simple script to generate a file list in a format easily consumable by a -# shell script. - -# Originally written by Jason Tibbitts tibbs@math.uh.edu in 2016. -# Donated to the public domain. If you require a statement of license, please -# consider this work to be licensed as "CC0 Universal", any version you choose. - - -from scandir import scandir - - -def get_ftype(entry): - """Return a simple indicator of the file type.""" - if entry.is_symlink(): - return 'l' - if entry.is_dir(): - return 'd' - return 'f' - - -def recursedir(path): - """Just like scandir, but recursively.""" - for entry in scandir(path): - if entry.is_dir(follow_symlinks=False): - for rentry in recursedir(entry.path): - yield rentry - yield entry - - -for entry in recursedir('.'): - info = entry.stat(follow_symlinks=False) - modtime = max(info.st_mtime, info.st_ctime) - ftype = get_ftype(entry) - print('{} {} {}'.format(modtime, ftype, entry.path[2:])) diff --git a/roles/releng/files/create-filelist b/roles/releng/files/create-filelist deleted file mode 100644 index d95000e..0000000 --- a/roles/releng/files/create-filelist +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/python - -# A simple script to generate a file list in a format easily consumable by a -# shell script. - -# Originally written by Jason Tibbitts tibbs@math.uh.edu in 2016. -# Donated to the public domain. If you require a statement of license, please -# consider this work to be licensed as "CC0 Universal", any version you choose. 
- - -from scandir import scandir - - -def get_ftype(entry): - """Return a simple indicator of the file type.""" - if entry.is_symlink(): - return 'l' - if entry.is_dir(): - return 'd' - return 'f' - - -def recursedir(path): - """Just like scandir, but recursively.""" - for entry in scandir(path): - if entry.is_dir(follow_symlinks=False): - for rentry in recursedir(entry.path): - yield rentry - yield entry - - -for entry in recursedir('.'): - info = entry.stat(follow_symlinks=False) - modtime = max(info.st_mtime, info.st_ctime) - ftype = get_ftype(entry) - print('{} {} {}'.format(modtime, ftype, entry.path[2:]))
+1
On 18 November 2016 at 20:05, Adam Williamson adamwill@fedoraproject.org wrote:
This commit should give us new 'filterlist' files alongside the 'fullfilelist' and 'fullfiletimelist' files in the /fedora, /archive and /alt folders on the mirrors. The contents of each 'filterlist' file are the same as the corresponding 'fullfilelist', but with all directories, packages (.rpm or .drpm files), device tree boot files (.dtb files) and HTML files (.html files) removed. This gives a massively smaller list which, right now, will be useful for fedfind (it can parse these lists instead of rsync scraping) and may possibly be useful for other things in future, I guess.
It would be nice to have this now as this all came out of the work around improving generation of the mediawriter 'available images' JSON file: we want to use fedfind to generate that, but the rsync scraping is pretty heavy for something that'll run quite frequently. This should improve things quite a bit (especially as I've written fedfind to cache the files and only re-download them if the Last-Modified header changes). I have the fedfind changes all written and tested (I tested against the fullfilelist files).
From 3f61df2f2879c23a7e44271527facce99bc92286 Mon Sep 17 00:00:00 2001 From: Adam Williamson awilliam@redhat.com Date: Fri, 18 Nov 2016 16:34:38 -0800 Subject: [PATCH] Generate filtered file lists for fedfind to use
This adds `filterlist` files alongside the `fullfilelist` and `fullfiletimelist` files. These are much, much shorter lists which skip the entries for packages, ARM device tree boot files, HTML files and directories. They are intended for consumption by fedfind, so it can stop using rsync scraping to discover the image files it looks for. To enable this, we update to a newer version of `create-filelist` from upstream `quick-fedora-mirror` and make `update-fullfiletimelist` create the filterlist files as well.
We also delete a couple of old copies of `create-filelist`; nirik made the two roles that use it share a common copy a few months back, but missed deleting the copy each role had in its `files` directory.
files/scripts/create-filelist | 10 ++++++++- files/scripts/update-fullfiletimelist | 19 ++++++++++++++-- roles/bodhi2/backend/files/create-filelist | 36 ------------------------------
roles/releng/files/create-filelist | 36 ------------------------------
4 files changed, 26 insertions(+), 75 deletions(-) delete mode 100644 roles/bodhi2/backend/files/create-filelist delete mode 100644 roles/releng/files/create-filelist
diff --git a/files/scripts/create-filelist b/files/scripts/create-filelist index eeba9d0..8fc3367 100755 --- a/files/scripts/create-filelist +++ b/files/scripts/create-filelist @@ -57,7 +57,9 @@ def recursedir(path='.', skip=[], alwaysskip=['.~tmp~']): def parseopts(): null = open(os.devnull, 'w') p = argparse.ArgumentParser(
description='Generate a list of files and times, suitable for
consumption by quick-fedora-mirror.')
description='Generate a list of files and times, suitable for
consumption by quick-fedora-mirror, '
'and a much smaller list with packages, Device Tree
boot files, HTML files and '
'directories filtered out, for consumption by
fedfind.') p.add_argument('-c', '--checksum', action='store_true', help='Include checksums of all repomd.xml files in the file list.') p.add_argument('-C', '--checksum-file', action='append', dest='checksum_files', @@ -73,6 +75,8 @@ def parseopts(): help='Filename of the file list with times (default: stdout).') p.add_argument('-f', '--filelist', type=argparse.FileType('w'), default=null, help='Filename of the file list without times (default: no plain file list is generated).')
- p.add_argument('-F', '--filterlist', type=argparse.FileType('w'),
default=null,
help='Filename of the filtered file list for fedfind
(default: not generated).')
opts = p.parse_args()
@@ -107,6 +111,10 @@ def main(): for entry in recursedir(skip=opts.skip_files): # opts.filelist.write(entry.path + '\n') print(entry.path, file=opts.filelist)
# write to filtered list if appropriate
skips = ('.rpm', '.drpm', '.dtb', '.html')
if not any(entry.path.endswith(skip) for skip in skips) and not
(entry.is_dir()):
print(entry.path, file=opts.filterlist) if entry.name in opts.checksum_files: checksums[entry.path[2:]] = True info = entry.stat(follow_symlinks=False)
diff --git a/files/scripts/update-fullfiletimelist b/files/scripts/update-fullfiletimelist index 016ca8e..e70fadc 100755 --- a/files/scripts/update-fullfiletimelist +++ b/files/scripts/update-fullfiletimelist @@ -25,6 +25,7 @@ CREATE=/usr/local/bin/create-filelist # context. FILELIST=fullfilelist TIMELIST='fullfiletimelist-$mod' +FILTERLIST=filterlist
usage () { echo @@ -107,12 +108,14 @@ cd $tmpd for mod in $MODS; do currentfl=$TOPD/$mod/${FILELIST/'$mod'/$mod} currenttl=$TOPD/$mod/${TIMELIST/'$mod'/$mod}
currentsl=$TOPD/$mod/${FILTERLIST/'$mod'/$mod} flname=$(basename $currentfl) tlname=$(basename $currenttl)
slname=$(basename $currentsl)
$CREATE -c -s -d $TOPD/$mod -f $flname -t $tlname
$CREATE -c -s -d $TOPD/$mod -f $flname -t $tlname -F $slname
# If a file list exsts and doesn't differ from what we just
generated,
# If a file list exists and doesn't differ from what we just
generated, # delete the latter. if [[ -f $currentfl ]] && diff -q $currentfl $flname > /dev/null; then rm -f $flname @@ -120,6 +123,9 @@ cd $tmpd if [[ -f $currenttl ]] && diff -q $currenttl $tlname > /dev/null; then rm -f $tlname fi
if [[ -f $currentsl ]] && diff -q $currentsl $slname > /dev/null;
then
rm -f $slname
fi
done
# Now we have the new file lists but in a temporary directory which
@@ -128,10 +134,13 @@ cd $tmpd for mod in $MODS; do currentfl=$TOPD/$mod/${FILELIST/'$mod'/$mod} currenttl=$TOPD/$mod/${TIMELIST/'$mod'/$mod}
currentsl=$TOPD/$mod/${FILTERLIST/'$mod'/$mod} flname=$(basename $currentfl) fldir=$(dirname $currentfl) tlname=$(basename $currenttl) tldir=$(dirname $currenttl)
slname=$(basename $currentsl)
sldir=$(dirname $currentsl) if [[ -f $flname ]]; then tmpf=$(mktemp -p $fldir $flname.XXXXXXXXXX)
@@ -145,6 +154,12 @@ cd $tmpd chmod 644 $tmpf mv $tmpf $currenttl fi
if [[ -f $slname ]]; then
tmpf=$(mktemp -p $sldir $slname.XXXXXXXXXX)
cp -p $slname $tmpf
chmod 644 $tmpf
mv $tmpf $currentsl
fi
done
) 9>$LOCKFILE diff --git a/roles/bodhi2/backend/files/create-filelist b/roles/bodhi2/backend/files/create-filelist deleted file mode 100644 index d95000e..0000000 --- a/roles/bodhi2/backend/files/create-filelist +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/python
-# A simple script to generate a file list in a format easily consumable by a -# shell script.
-# Originally written by Jason Tibbitts tibbs@math.uh.edu in 2016. -# Donated to the public domain. If you require a statement of license, please -# consider this work to be licensed as "CC0 Universal", any version you choose.
-from scandir import scandir
-def get_ftype(entry):
- """Return a simple indicator of the file type."""
- if entry.is_symlink():
return 'l'
- if entry.is_dir():
return 'd'
- return 'f'
-def recursedir(path):
- """Just like scandir, but recursively."""
- for entry in scandir(path):
if entry.is_dir(follow_symlinks=False):
for rentry in recursedir(entry.path):
yield rentry
yield entry
-for entry in recursedir('.'):
- info = entry.stat(follow_symlinks=False)
- modtime = max(info.st_mtime, info.st_ctime)
- ftype = get_ftype(entry)
- print('{} {} {}'.format(modtime, ftype, entry.path[2:]))
diff --git a/roles/releng/files/create-filelist b/roles/releng/files/create-filelist deleted file mode 100644 index d95000e..0000000 --- a/roles/releng/files/create-filelist +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/python
-# A simple script to generate a file list in a format easily consumable by a -# shell script.
-# Originally written by Jason Tibbitts tibbs@math.uh.edu in 2016. -# Donated to the public domain. If you require a statement of license, please -# consider this work to be licensed as "CC0 Universal", any version you choose.
-from scandir import scandir
-def get_ftype(entry):
- """Return a simple indicator of the file type."""
- if entry.is_symlink():
return 'l'
- if entry.is_dir():
return 'd'
- return 'f'
-def recursedir(path):
- """Just like scandir, but recursively."""
- for entry in scandir(path):
if entry.is_dir(follow_symlinks=False):
for rentry in recursedir(entry.path):
yield rentry
yield entry
-for entry in recursedir('.'):
- info = entry.stat(follow_symlinks=False)
- modtime = max(info.st_mtime, info.st_ctime)
- ftype = get_ftype(entry)
- print('{} {} {}'.format(modtime, ftype, entry.path[2:]))
-- 2.10.2
-- Adam Williamson Fedora QA Community Monkey IRC: adamw | Twitter: AdamW_Fedora | XMPP: adamw AT happyassassin . net http://www.happyassassin.net _______________________________________________ infrastructure mailing list -- infrastructure@lists.fedoraproject.org To unsubscribe send an email to infrastructure-leave@lists.fedoraproject.org
On Fri, 2016-11-18 at 19:05 -0700, Kevin Fenzi wrote:
+1 here. I can push it/test it tomorrow if you like.
kevin
That'd be great!
infrastructure@lists.fedoraproject.org