From aaacce58e9ec87c978ef178ee592253a34452885 Mon Sep 17 00:00:00 2001
From: Leonardo Taccari
Date: Mon, 17 Dec 2018 18:16:08 +0100
Subject: [PATCH] tscrape_update: Sync with sfeed_update

- Handle signals consistently in different shells
- Improve SIGINT handling
- Add a variable for max amount of feeds to update concurrently
- Add filter(), order() support per feed
- Don't always exit 1, exit 130 on SIGINT, exit 0 otherwise
- Fail on feed HTTP redirect

The --http1.0 curl option was not removed (it is not present in
sfeed_update): it is kept to avoid HTTP/2.
---
 tscrape_update | 93 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 28 deletions(-)

diff --git a/tscrape_update b/tscrape_update
index 7fa9de5..0a1016b 100755
--- a/tscrape_update
+++ b/tscrape_update
@@ -5,11 +5,15 @@
 # defaults
 tscrapepath="$HOME/.tscrape/feeds"
 
+# used for processing feeds concurrently: wait until ${maxjobs} amount of
+# feeds are finished at a time.
+maxjobs=8
+
 # load config (evaluate shellscript).
 # loadconfig(configfile)
 loadconfig() {
 	# allow to specify config via argv[1].
-	if [ ! x"$1" = x"" ]; then
+	if [ "$1" != "" ]; then
 		# get absolute path of config file.
 		config=$(readlink -f "$1")
 	else
@@ -17,8 +21,7 @@ loadconfig() {
 		config="$HOME/.tscrape/tscraperc"
 	fi
 
-	# load config: config is loaded here to be able to override $tscrapepath
-	# or functions.
+	# config is loaded here to be able to override $tscrapepath or functions.
 	if [ -r "${config}" ]; then
 		. "${config}"
 	else
@@ -28,46 +31,69 @@
 	fi
 }
 
-# merge raw files.
-# merge(oldfile, newfile)
+# merge raw files: unique sort by id, retweetid.
+# merge(name, oldfile, newfile)
 merge() {
-	# unique sort by id, retweetid.
-	# order by timestamp (desc).
-	(sort -t '	' -u -k5,5 -k8,8 "$1" "$2" 2>/dev/null) |
+	sort -t '	' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
+}
+
+# filter fields.
+# filter(name)
+filter() {
+	cat
+}
+
+# order by timestamp (descending).
+# order(name)
+order() {
 	sort -t '	' -k1rn,1
 }
 
 # fetch a feed via HTTP/HTTPS etc.
-# fetchfeed(url, name, feedfile)
+# fetchfeed(name, url, feedfile)
 fetchfeed() {
-	if curl --http1.0 -H 'User-Agent:' -f -s -S --max-time 15 -z "$3" "$1"; then
-		printf "  OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+	if curl --http1.0 -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
+		-z "$3" "$2" 2>/dev/null; then
+		printf "  OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
 	else
-		printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+		printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
 	fi
 }
 
 # fetch and parse feed.
 # feed(name, feedurl)
 feed() {
+	# wait until ${maxjobs} are finished: throughput using this logic is
+	# non-optimal, but it is simple and portable.
+	[ ${signo} -ne 0 ] && return
+	[ $((curjobs % maxjobs)) -eq 0 ] && wait
+	[ ${signo} -ne 0 ] && return
+	curjobs=$((curjobs + 1))
+
 	(name="$1"
-	tmpfeedfile="${tscrapetmpdir}/${name}"
+	filename="$(printf '%s' "$1" | tr '/' '_')"
+	feedurl="$2"
+	tmpfeedfile="${tscrapetmpdir}/${filename}"
 	tmpencfile=""
-	tscrapefile="${tscrapepath}/$1"
+	tscrapefile="${tscrapepath}/${filename}"
 
-	fetchfeed "$2" "$1" "${tscrapefile}" | tscrape > "${tmpfeedfile}"
+	fetchfeed "${name}" "${feedurl}" "${tscrapefile}" | \
+		tscrape | filter "${name}" > "${tmpfeedfile}"
 
 	# get new data and merge with old.
-	tscrapefilenew="${tscrapepath}/${name}.new"
+	tscrapefilenew="${tscrapepath}/${filename}.new"
 	# new feed data is non-empty.
 	if [ -s "${tmpfeedfile}" ]; then
 		# if file exists, merge
 		if [ -e "${tscrapefile}" ]; then
-			merge "${tscrapefile}" "${tmpfeedfile}" > "${tscrapefilenew}"
+			merge "${name}" "${tscrapefile}" "${tmpfeedfile}" | \
+				order "${name}" > "${tscrapefilenew}"
+
 			# overwrite old file with updated file
 			mv "${tscrapefilenew}" "${tscrapefile}"
 		else
-			merge "/dev/null" "${tmpfeedfile}" > "${tscrapefile}"
+			merge "${name}" "/dev/null" "${tmpfeedfile}" | \
+				order "${name}" > "${tscrapefile}"
 		fi
 	fi) &
 }
@@ -81,28 +107,39 @@ cleanup() {
 	rm -rf "${tscrapetmpdir}"
 }
 
+sighandler() {
+	signo="$1"
+	# ignore TERM signal for myself.
+	trap -- "" TERM
+	# kill all running childs >:D
+	kill -TERM -$$
+}
+
 feeds() {
 	echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
 	echo "See tscraperc.example for an example." >&2
 }
 
+# job counter.
+curjobs=0
+# signal number received for parent.
+signo=0
+# SIGINT: signal to interrupt parent.
+trap -- "sighandler 2" "INT"
+# SIGTERM: signal to terminate parent.
+trap -- "sighandler 15" "TERM"
 # load config file.
 loadconfig "$1"
-# fetch feeds and store in temporary file.
+# fetch feeds and store in temporary directory.
 tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
-# kill whole current process group on ^C.
-isrunning="1"
-# SIGTERM: signal to terminate parent.
-trap -- "terminated" "15"
-# SIGINT: kill all running childs >:D
-trap -- "kill -TERM -$$" "2"
 # make sure path exists.
 mkdir -p "${tscrapepath}"
 # fetch feeds specified in config file.
 feeds
 # wait till all feeds are fetched (concurrently).
-wait
+[ ${signo} -eq 0 ] && wait
 # cleanup temporary files etc.
 cleanup
-# if terminated.
-[ "${isrunning}" = "0" ] && exit 1
+# on signal SIGINT and SIGTERM exit with signal number + 128.
+[ ${signo} -ne 0 ] && exit $((signo+128))
+exit 0
-- 
2.20.0
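P.S. on how the new concurrency and signal handling fit together: feed()
runs each fetch in a background subshell and calls wait whenever
${curjobs} is a multiple of ${maxjobs}, so feeds are processed in batches
of at most ${maxjobs} jobs; sighandler() records the signal number and
TERMs the whole process group, and the parent then exits with
$((signo + 128)), i.e. 130 on SIGINT. Below is a minimal, self-contained
sketch of that same pattern; the job() helper, its sleep/printf body, and
the job count are made up for illustration and are not part of the patch:

	#!/bin/sh
	# Sketch of tscrape_update's batching and signal handling.
	# Assumes the script is its own process group leader, as when
	# started from an interactive shell.
	maxjobs=4
	curjobs=0
	signo=0

	sighandler() {
		signo="$1"
		# ignore TERM for ourselves, then terminate the whole
		# process group, including all background jobs.
		trap -- "" TERM
		kill -TERM -$$
	}
	trap -- "sighandler 2" "INT"
	trap -- "sighandler 15" "TERM"

	# job(name): hypothetical stand-in for feed(); waits for the
	# previous batch once ${maxjobs} jobs have been started.
	job() {
		[ ${signo} -ne 0 ] && return
		[ $((curjobs % maxjobs)) -eq 0 ] && wait
		[ ${signo} -ne 0 ] && return
		curjobs=$((curjobs + 1))
		(sleep 1; printf 'job %s done\n' "$1") &
	}

	for i in 1 2 3 4 5 6 7 8 9 10; do
		job "$i"
	done
	# wait for the last batch, then follow the 128 + signal number
	# exit status convention.
	[ ${signo} -eq 0 ] && wait
	[ ${signo} -ne 0 ] && exit $((signo + 128))
	exit 0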