-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathruncjkwp.sh
121 lines (106 loc) · 2.77 KB
/
runcjkwp.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Seed the exported wiki language code from the first command-line argument.
# When that argument is a mode flag (-c / -r), the option handling later in
# this script replaces the value with the second argument.
WIKILANG=$1
export WIKILANG
#######################################
# Snapshot the current directory tree as JSON and zip it.
#
# Runs `tree -J` in the current working directory, post-processes the
# output with sed and stores the result in tree.json / tree.json.zip.
# The sed rules only touch entries where the "type" field is directly
# followed by another quote (i.e. the separating comma is missing):
#   - for "file" and "directory" the type field is dropped,
#   - for "link" the missing comma is inserted.
# Arguments: none
# Outputs:   tree.json and tree.json.zip in the current directory
# Returns:   non-zero if `tree` or `sed` fails, otherwise zip's status
#######################################
treeJson() {
  # Do not archive an empty or partial snapshot when tree is unavailable.
  tree -J > tree.json || return

  # "-i.bak" (suffix attached) is accepted by both GNU and BSD sed; the
  # detached form "-i .bak" only works with BSD sed and makes GNU sed
  # treat ".bak" as the script. A single pass applies all three rewrites.
  sed -i.bak -E \
    -e 's/"type":"file""/"/g' \
    -e 's/"type":"directory""/"/g' \
    -e 's/"type":"link""/"type":"link","/g' \
    tree.json || return

  rm -f tree.json.bak
  zip tree.json.zip tree.json
}
# Decide which dump version to work on.
#   -c LANG : continue a previous run; the version is read back from the
#             VERSION file and LANG becomes the wiki language.
#   other   : start from the current month. The version is YYYYMM01, or
#             YYYYMM20 once the day of month is past the 20th, and it is
#             written to the VERSION file.
# "$1" is quoted so that running the script without arguments no longer
# makes `[` fail with "unary operator expected".
if [ "${1:-}" = "-c" ]; then
  export WIKILANG="${2:-}"
  if [ ! -r VERSION ]; then
    echo "VERSION file not found, cannot continue a previous run, exit..." >&2
    exit 1
  fi
  VERSION=$(cat VERSION)
else
  # Take one timestamp so year/month and day cannot disagree around midnight.
  today=$(date '+%Y%m%d')
  day=${today#??????}
  VERSION="${today%??}01"
  # `[ -gt ]` compares in decimal, so a leading zero ("08") is harmless here.
  if [ "$day" -gt 20 ]; then
    VERSION="${today%??}20"
  fi
  printf '%s\n' "$VERSION" > VERSION
fi
# -t : only regenerate the directory-tree snapshot, then stop.
if [ "${1:-}" = "-t" ]; then
  treeJson
  # NOTE(review): the non-zero status is kept from the original script even
  # though this path is not an error; confirm whether callers rely on it.
  exit 1
fi

# -r LANG : start over but reuse already downloaded data for LANG.
if [ "${1:-}" = "-r" ]; then
  export WIKILANG="${2:-}"
fi

# Fall back to the Chinese wiki when no language was given. The expansion is
# quoted so the emptiness test is explicit instead of relying on `[ -z ]`
# being true with a missing operand.
if [ -z "${WIKILANG:-}" ]; then
  export WIKILANG=zh
fi
# Working directory for the selected wiki, e.g. "zhwiki".
DATADIR="${WIKILANG}wiki"

# English has its own Lua filter; every other language uses the CJK one.
case "$WIKILANG" in
  en) FILTER="filters/gfm-en.lua" ;;
  *)  FILTER="filters/gfm-cjk.lua" ;;
esac

# Stop early when the chosen filter file is not present.
if [ ! -f $FILTER ]; then
  echo "Filter $FILTER does not exist, exit..."
  exit 1
fi
echo "Filter $FILTER exists... Version: $VERSION"
# jq is used further down to read the dump status file; install it when it
# is not on PATH.
command -v jq >/dev/null 2>&1 || apt install jq

# Download the status file describing the dump for $DATADIR / $VERSION.
# Abort on failure: continuing would leave an empty status file behind and
# the following steps would work on an empty stream list.
if ! wget "https://dumps.wikimedia.org/$DATADIR/$VERSION/dumpstatus.json" \
  -O "$DATADIR.dumpstatus.json"; then
  echo "Failed to download dumpstatus.json for $DATADIR/$VERSION, exit..." >&2
  exit 1
fi
# Prepare the data directory according to the run mode:
#   -c : continue, keep everything
#   -r : start over but keep the downloaded data
#   *  : start over from scratch, wiping the data directory
case "${1:-}" in
  -c)
    echo "Continue $WIKILANG..."
    ;;
  -r)
    echo "Start over and reusing data..."
    ;;
  *)
    echo "Start over $WIKILANG..."
    # ${DATADIR:?} aborts instead of running rm with an empty path.
    rm -rfv -- "${DATADIR:?}"
    ;;
esac

# -p: the directory already exists in the -c and -r modes.
mkdir -p -- "$DATADIR"

# Build the list of article-dump URLs, one per line, sorted by path.
# jq prints each url as a quoted JSON string; awk takes the text between
# the quotes and prefixes the dump host.
jq ".jobs.articlesdump.files[].url" "$DATADIR.dumpstatus.json" \
  | sort \
  | awk -F '"' '{print "https://dumps.wikimedia.org" $2 }' \
  > "$DATADIR/$VERSION.sh"
# Conversion log for this wiki and the sibling checkout that receives the
# converted output.
LOG="$DATADIR.convert.log"
REPO="../wikipedia.$WIKILANG"

if [ -d "$REPO" ]; then
  echo "$REPO exists"
else
  # NOTE(review): the remote host was obfuscated ("[email protected]") in the
  # reviewed copy of this script, which splits into two invalid words for
  # git. git@github.com is assumed here - confirm before relying on it.
  # Cloning straight into $REPO replaces the former `cd ..` / `cd -` pair.
  if ! git clone --depth=3 \
    "git@github.com:chinapedia/wikipedia.$WIKILANG.git" "$REPO"; then
    echo "Failed to clone into $REPO, exit..." >&2
    exit 1
  fi
fi

# Record the dump version inside the checkout and start with an empty log.
cp -f VERSION "$REPO/VERSION"
: > "$LOG"
# Convert every dump stream listed in $DATADIR/$VERSION.sh, one commit and
# push per stream. Streams are numbered from 1 in list order; the
# decompressed stream is stored as $DATADIR/<number> and truncated to an
# empty marker file once it has been converted.
counter=1
workdir=$PWD

# The list is read on file descriptor 3 so that commands run inside the
# loop (wget, php, git, ...) cannot consume the remaining URLs from stdin.
while IFS= read -r url <&3 || [ -n "$url" ]; do
  [ -n "$url" ] || continue
  echo "Start $url" >> "$LOG"
  stream="$DATADIR/$counter"

  if [ -f "$stream" ]; then
    echo "stream  $counter  exists."
    # In continue mode an existing stream file means "already handled".
    if [ "${1:-}" = "-c" ]; then
      counter=$((counter + 1))
      continue
    fi
  else
    # Download and decompress. On failure, leave no stream file behind so
    # the stream is not mistaken for a finished one by a later -c run.
    if ! wget -O "$stream.bz2" "$url" || ! bzip2 -dk "$stream.bz2"; then
      echo "Failed to fetch stream $counter from $url" >> "$LOG"
      rm -f -- "$stream.bz2" "$stream"
      counter=$((counter + 1))
      continue
    fi
    rm -f -- "$stream.bz2"
  fi

  # -p: these directories already exist from the second stream onwards.
  mkdir -p "$REPO/Errors" "$REPO/Redirect"
  php -d memory_limit=4096M convert.php --filename="$stream" --output="$REPO" --luafilter="$FILTER" --template="cfm-$WIKILANG"

  # Never run the cleanup and git commands outside the target checkout.
  cd "$REPO" || { echo "Cannot enter $REPO, exit..." >&2; exit 1; }
  rm -f Errors/*.wikitext
  python3 clean.py
  # Drop empty error logs (size below one byte).
  find Errors -name "*.log" -type f -size -1c -delete
  git add .
  git commit -m "Convert from $VERSION stream$counter"
  git push
  cd "$workdir" || { echo "Cannot return to $workdir, exit..." >&2; exit 1; }

  echo "Done stream $counter" >> "$LOG"
  # Keep an empty file as the "done" marker and free the disk space.
  : > "$stream"
  counter=$((counter + 1))
done 3< "$DATADIR/$VERSION.sh"
# Final step: stamp the new version into the checkout's README, refresh the
# directory-tree snapshot, then commit and push.
# Abort if the checkout cannot be entered, so README.md is never edited and
# committed from the wrong directory.
cd "$REPO" || { echo "Cannot enter $REPO, exit..." >&2; exit 1; }

# Replace every 8-digit number in README.md with the current version.
# "-i.bak" (suffix attached) works with both GNU and BSD sed; the detached
# form "-i .bak" is BSD-only.
sed -i.bak -E "s/[0-9]{8}/$VERSION/g" README.md
git add README.md
# Remove the sed backup before the tree snapshot is taken.
rm -f README.md.bak

treeJson
git add tree.json.zip
git commit -m "Set version to $VERSION"
git push