-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawlfiles.sh
executable file
·83 lines (69 loc) · 1.69 KB
/
crawlfiles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/sh
#
# Dirty recursive scraper which will break when the site's layout changes.
# Global settings read by crawl() and the start-up checks.

# Directory the script was started from; crawl() compares $PWD against it
# to build the relative paths shown in progress messages.
basedir="$PWD"

# Prefix put in front of the hrefs scraped from listing pages.
baseurl='https://ecampus.wbstraining.de'

# Cookie file handed to curl via --cookie; lives in the XDG cache
# directory, or in ~/.cache when XDG_CACHE_HOME is unset or empty.
cookiejar="${XDG_CACHE_HOME:-$HOME/.cache}/wbs_cookies"
# die FORMAT [ARG...]
#
# Print a printf-style message to stderr, prefixed with the script's base
# name and ": ", then terminate the current shell with status 1.
# FORMAT is used as (part of) the printf format string, so callers supply
# their own trailing newline.
die() {
  template=$1
  shift
  # The caller-provided format is spliced into the format string on purpose.
  printf "%s: $template" "${0##*/}" "$@" 1>&2
  exit 1
}
# info FORMAT [ARG...]
#
# Progress message helper: passes all arguments straight to printf and
# sends the result to stderr, leaving stdout untouched.
info() {
  { printf "$@"; } 1>&2
}
# crawl URL
#
# Fetch the listing page at URL with curl (using $cookiejar) and act on
# every line that carries an il_ContainerItemTitle heading link, relative
# to the current directory:
#   href contains cmd=calldirectlink   -> write "$baseurl/$link" into a file
#                                         named after the link title
#   href contains cmd=view             -> create/enter a directory named after
#                                         the title and recurse on
#                                         "$baseurl/$link"
#   href contains goto.php ... target= -> download the href with curl, letting
#                                         the server choose the file name
#
# Reads the globals cookiejar, baseurl and basedir and calls info and die.
# Sets the (non-local) variables line, link, name and relapath.
#
# NOTE: the loop is the last stage of a pipeline; in shells that run such a
# stage in a subshell, `exit`/`die` inside it only ends that listing, not
# the whole script.
crawl() {
  curl --cookie "$cookiejar" --no-progress-meter "$1" \
    | grep '<h3 class="il_ContainerItemTitle"><a href="' \
    | while IFS= read -r line; do
      # href value: text after the first `<a href=` up to the first
      # ` class=`, with the surrounding double quotes removed.
      link=${line#*<a href=}
      link=${link%% class=*}
      link=${link#\"}
      link=${link%\"}

      # Title: text between the last `>` and the final `</a>`.
      name=${line%</a>*}
      name=${name##*>}

      # No slashes in directory names please!
      case $name in
        */*) name=$(printf '%s\n' "$name" | tr / \|) ;;
      esac

      # Path shown in messages, relative to the start directory.
      if [ "$PWD" = "$basedir" ]; then
        relapath=$name
      else
        relapath=${PWD#"$basedir"/}/$name
      fi

      # The title comes from the remote page and is about to be used as a
      # path component. An empty title, "." or ".." would make the crawler
      # write into (or walk out of) the current directory, so skip those
      # entries. Downloads are exempt: they never use $name as a path.
      case $link in
        *[?\&]cmd=calldirectlink* | *[?\&]cmd=view*)
          case $name in
            '' | . | ..)
              info 'Skipping entry with unusable name: %s\n' "$relapath"
              continue
              ;;
          esac
          ;;
      esac

      # The "./" prefix below keeps a title starting with "-" from being
      # parsed as an option and stops cd from consulting CDPATH.
      case $link in
        *[?\&]cmd=calldirectlink*) # Link
          info 'Creating link file: %s\n' "$relapath"
          # TODO: html decode.
          # TODO: don't overwrite files blindly
          printf '%s/%s\n' "$baseurl" "$link" >"./$name" ||
            die 'Failed writing file: %s\n' "$PWD/$name"
          ;;
        *[?\&]cmd=view*) # Directory
          info 'Making directory: %s\n' "$relapath"
          [ -e "./$name" ] ||
            mkdir -- "./$name" || exit
          cd "./$name" || exit
          crawl "$baseurl/$link"
          cd .. || exit
          ;;
        *goto.php*[?\&]target=*) # File
          info 'Getting file: %s\n' "$relapath"
          # The href is passed to curl as-is (no $baseurl prefix).
          curl \
            --cookie "$cookiejar" \
            --no-clobber \
            --no-progress-meter \
            --remote-name \
            --remote-header-name \
            --remote-time \
            "$link"
          ;;
      esac
    done
}
# Entry point: require exactly one argument (the start URL) and a readable
# cookie jar, then crawl from that URL into the current directory.
if [ "$#" -ne 1 ]; then
  die 'Usage: %s <url>\n' "${0##*/}"
fi

if [ ! -r "$cookiejar" ]; then
  die 'Cookie jar does not exist or not readable: %s\n' "$cookiejar"
fi

crawl "$1"