From aaacce58e9ec87c978ef178ee592253a34452885 Mon Sep 17 00:00:00 2001
From: Leonardo Taccari
Date: Mon, 17 Dec 2018 18:16:08 +0100
Subject: [PATCH] tscrape_update: Sync with sfeed_update

- Handle signals consistently in different shells
- Improve SIGINT handling
- Add a variable for max amount of feeds to update concurrently
- Add filter(), order() support per feed
- Don't always exit 1, exit 130 on SIGINT, exit 0 otherwise
- Fail on feed HTTP redirect

The --http1.0 curl option was not removed (it is not present in
sfeed_update): it is kept to avoid HTTP/2.
---
 tscrape_update | 93 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/tscrape_update b/tscrape_update
index 7fa9de5..0a1016b 100755
--- a/tscrape_update
+++ b/tscrape_update
@@ -5,11 +5,15 @@
 # defaults
 tscrapepath="$HOME/.tscrape/feeds"
 
+# used for processing feeds concurrently: wait until ${maxjobs} amount of
+# feeds are finished at a time.
+maxjobs=8
+
 # load config (evaluate shellscript).
 # loadconfig(configfile)
 loadconfig() {
 	# allow to specify config via argv[1].
-	if [ ! x"$1" = x"" ]; then
+	if [ "$1" != "" ]; then
 		# get absolute path of config file.
 		config=$(readlink -f "$1")
 	else
@@ -17,8 +21,7 @@ loadconfig() {
 		config="$HOME/.tscrape/tscraperc"
 	fi
 
-	# load config: config is loaded here to be able to override $tscrapepath
-	# or functions.
+	# config is loaded here to be able to override $tscrapepath or functions.
 	if [ -r "${config}" ]; then
 		. "${config}"
 	else
@@ -28,46 +31,69 @@
 	fi
 }
 
-# merge raw files.
-# merge(oldfile, newfile)
+# merge raw files: unique sort by id, retweetid.
+# merge(name, oldfile, newfile)
 merge() {
-	# unique sort by id, retweetid.
-	# order by timestamp (desc).
-	(sort -t '	' -u -k5,5 -k8,8 "$1" "$2" 2>/dev/null) |
+	sort -t '	' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
+}
+
+# filter fields.
+# filter(name)
+filter() {
+	cat
+}
+
+# order by timestamp (descending).
+# order(name)
+order() {
 	sort -t '	' -k1rn,1
 }
 
 # fetch a feed via HTTP/HTTPS etc.
-# fetchfeed(url, name, feedfile)
+# fetchfeed(name, url, feedfile)
 fetchfeed() {
-	if curl --http1.0 -H 'User-Agent:' -f -s -S --max-time 15 -z "$3" "$1"; then
-		printf "  OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+	if curl --http1.0 -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
+		-z "$3" "$2" 2>/dev/null; then
+		printf "  OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
 	else
-		printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+		printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
 	fi
 }
 
 # fetch and parse feed.
 # feed(name, feedurl)
 feed() {
+	# wait until ${maxjobs} are finished: throughput using this logic is
+	# non-optimal, but it is simple and portable.
+	[ ${signo} -ne 0 ] && return
+	[ $((curjobs % maxjobs)) -eq 0 ] && wait
+	[ ${signo} -ne 0 ] && return
+	curjobs=$((curjobs + 1))
+
 	(name="$1"
-	tmpfeedfile="${tscrapetmpdir}/${name}"
+	filename="$(printf '%s' "$1" | tr '/' '_')"
+	feedurl="$2"
+	tmpfeedfile="${tscrapetmpdir}/${filename}"
 	tmpencfile=""
-	tscrapefile="${tscrapepath}/$1"
+	tscrapefile="${tscrapepath}/${filename}"
 
-	fetchfeed "$2" "$1" "${tscrapefile}" | tscrape > "${tmpfeedfile}"
+	fetchfeed "${name}" "${feedurl}" "${tscrapefile}" | \
+		tscrape | filter "${name}" > "${tmpfeedfile}"
 
 	# get new data and merge with old.
-	tscrapefilenew="${tscrapepath}/${name}.new"
+	tscrapefilenew="${tscrapepath}/${filename}.new"
 	# new feed data is non-empty.
 	if [ -s "${tmpfeedfile}" ]; then
 		# if file exists, merge
 		if [ -e "${tscrapefile}" ]; then
-			merge "${tscrapefile}" "${tmpfeedfile}" > "${tscrapefilenew}"
+			merge "${name}" "${tscrapefile}" "${tmpfeedfile}" | \
+				order "${name}" > "${tscrapefilenew}"
+
 			# overwrite old file with updated file
 			mv "${tscrapefilenew}" "${tscrapefile}"
 		else
-			merge "/dev/null" "${tmpfeedfile}" > "${tscrapefile}"
+			merge "${name}" "/dev/null" "${tmpfeedfile}" | \
+				order "${name}" > "${tscrapefile}"
 		fi
 	fi) &
 }
@@ -81,28 +107,39 @@ cleanup() {
 	rm -rf "${tscrapetmpdir}"
 }
 
+sighandler() {
+	signo="$1"
+	# ignore TERM signal for myself.
+	trap -- "" TERM
+	# kill all running childs >:D
+	kill -TERM -$$
+}
+
 feeds() {
 	echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
 	echo "See tscraperc.example for an example." >&2
 }
 
+# job counter.
+curjobs=0
+# signal number received for parent.
+signo=0
+# SIGINT: signal to interrupt parent.
+trap -- "sighandler 2" "INT"
+# SIGTERM: signal to terminate parent.
+trap -- "sighandler 15" "TERM"
 # load config file.
 loadconfig "$1"
-# fetch feeds and store in temporary file.
+# fetch feeds and store in temporary directory.
 tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
-# kill whole current process group on ^C.
-isrunning="1"
-# SIGTERM: signal to terminate parent.
-trap -- "terminated" "15"
-# SIGINT: kill all running childs >:D
-trap -- "kill -TERM -$$" "2"
 # make sure path exists.
 mkdir -p "${tscrapepath}"
 # fetch feeds specified in config file.
 feeds
 # wait till all feeds are fetched (concurrently).
-wait
+[ ${signo} -eq 0 ] && wait
 # cleanup temporary files etc.
 cleanup
-# if terminated.
-[ "${isrunning}" = "0" ] && exit 1
+# on signal SIGINT and SIGTERM exit with signal number + 128.
+[ ${signo} -ne 0 ] && exit $((signo+128))
+exit 0
-- 
2.20.0
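P.S. on how the new concurrency and signal handling fit together: feed()
runs each fetch in a background subshell and calls wait whenever
${curjobs} is a multiple of ${maxjobs}, so feeds are processed in batches
of at most ${maxjobs} jobs; sighandler() records the signal number and
TERMs the whole process group, and the parent then exits with
$((signo + 128)), i.e. 130 on SIGINT. Below is a minimal, self-contained
sketch of that same pattern; the job() helper, its sleep/printf body, and
the job count are made up for illustration and are not part of the patch:

	#!/bin/sh
	# Sketch of tscrape_update's batching and signal handling.
	# Assumes the script is its own process group leader, as when
	# started from an interactive shell.
	maxjobs=4
	curjobs=0
	signo=0

	sighandler() {
		signo="$1"
		# ignore TERM for ourselves, then terminate the whole
		# process group, including all background jobs.
		trap -- "" TERM
		kill -TERM -$$
	}
	trap -- "sighandler 2" "INT"
	trap -- "sighandler 15" "TERM"

	# job(name): hypothetical stand-in for feed(); waits for the
	# previous batch once ${maxjobs} jobs have been started.
	job() {
		[ ${signo} -ne 0 ] && return
		[ $((curjobs % maxjobs)) -eq 0 ] && wait
		[ ${signo} -ne 0 ] && return
		curjobs=$((curjobs + 1))
		(sleep 1; printf 'job %s done\n' "$1") &
	}

	for i in 1 2 3 4 5 6 7 8 9 10; do
		job "$i"
	done
	# wait for the last batch, then follow the 128 + signal number
	# exit status convention.
	[ ${signo} -eq 0 ] && wait
	[ ${signo} -ne 0 ] && exit $((signo + 128))
	exit 0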