-
-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathsync_csv.sh
executable file
·143 lines (121 loc) · 3.79 KB
/
sync_csv.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/bin/bash
#
# Prepares one HTTP Archive CSV dump for BigQuery: downloads the pages and
# requests dumps from Google Storage, converts them, uploads the converted
# files, loads them into the summary_pages / summary_requests tables and then
# attempts to generate the reports for that crawl date.
#
# Usage:
#
#   ./sync_csv.sh [mobile_][Mon_D_YYYY]
#
# Examples:
#
#   ./sync_csv.sh mobile_Dec_15_2018
#   ./sync_csv.sh Jan_1_2019

# Root directory holding the downloaded and converted CSV data.
DATA=$HOME/archive
# Directory the script was started from; the schema files are read from
# $BASE/schema later on.
BASE=$(pwd)

# The archive name is mandatory; report its absence on stderr.
if [[ -z "$1" ]]; then
  echo "Must provide date, eg. Apr_15_2013" >&2
  exit 1
fi

archive=$1
# Any name containing "mobile" selects the mobile client, but only a leading
# "mobile_" prefix is stripped to obtain the date part.
if [[ "$archive" == *mobile* ]]; then
  mobile=1
  adate=${archive#mobile_}
else
  mobile=0
  adate=$archive
fi
echo "Processing $adate, mobile: $mobile, archive: $archive"
# Create the per-archive output directory and switch to the data root. The
# relative paths used after this point are resolved against $DATA, so neither
# step may be allowed to fail silently.
if ! mkdir -p -- "$DATA/processed/$archive"; then
  echo "Cannot create $DATA/processed/$archive, exiting" >&2
  exit 1
fi
if ! cd -- "$DATA"; then
  echo "Cannot change directory to $DATA, exiting" >&2
  exit 1
fi

# Turn Mon_D_YYYY into YYYY_MM_DD by handing "Mon D YYYY" to GNU date. A date
# that cannot be parsed would otherwise leave YYYY_MM_DD empty and yield table
# names such as "summary_pages._desktop", so stop here instead.
if ! YYYY_MM_DD=$(date --date="${adate//_/ }" "+%Y_%m_%d"); then
  echo "Cannot parse a date from '${adate}' (expected Mon_D_YYYY), exiting" >&2
  exit 1
fi

if [[ $mobile == 1 ]]; then
  client="mobile"
else
  client="desktop"
fi

# BigQuery table names (dataset.table) for this crawl date and client.
ptable="summary_pages.${YYYY_MM_DD}_${client}"
rtable="summary_requests.${YYYY_MM_DD}_${client}"
# Nothing to do when both summary tables are already present in BigQuery.
# To overwrite them, delete the tables from BigQuery before running this script.
if bq show httparchive:${ptable} > /dev/null 2>&1; then
  if bq show httparchive:${rtable} > /dev/null 2>&1; then
    echo -e "BigQuery summary tables for ${YYYY_MM_DD}_${client} already exist, exiting"
    exit 0
  fi
fi
# Fetch the raw pages dump unless a copy is already in the working directory.
pages_dump="httparchive_${archive}_pages.csv.gz"
if [[ -f $pages_dump ]]; then
  echo -e "Pages data already downloaded for $archive, skipping."
else
  echo -e "Downloading data for $archive"
  if ! gsutil cp "gs://httparchive/downloads/${pages_dump}" ./; then
    echo "Pages data for ${adate} is missing, exiting"
    exit 1
  fi
fi

# Same for the raw requests dump.
requests_dump="httparchive_${archive}_requests.csv.gz"
if [[ -f $requests_dump ]]; then
  echo -e "Request data already downloaded for $archive, skipping."
else
  if ! gsutil cp "gs://httparchive/downloads/${requests_dump}" ./; then
    echo "Request data for ${adate} is missing, exiting"
    exit 1
  fi
fi
# Convert the raw dumps into CSV files for the BigQuery load.
#
# Each conversion writes to a hidden staging path and is moved into place only
# after the whole pipeline has succeeded (pipefail makes a failure in any stage
# count). Otherwise a failed gunzip/sed/python stage would leave truncated
# output behind that the "already converted" checks below would accept on the
# next run. The hidden names are also not matched by a plain `*` glob.
out_dir="processed/${archive}"

if [ ! -f "${out_dir}/pages.csv.gz" ]; then
  echo -e "Converting pages data"
  pages_partial="${out_dir}/.pages.csv.gz.partial"
  # sed rewrites, in order: `\N,` -> `"",`; a trailing `\N` -> `""`; `\"`
  # preceded by a non-backslash character -> `""` (run twice, presumably to
  # catch matches that overlap on the first pass - TODO confirm); and
  # `\"","` -> `\\","`.
  if ! (
    set -o pipefail
    gunzip -c "httparchive_${archive}_pages.csv.gz" \
      | sed -e 's/\\N,/"",/g' -e 's/\\N$/""/g' -e's/\([^\]\)\\"/\1""/g' -e's/\([^\]\)\\"/\1""/g' -e 's/\\"","/\\\\","/g' \
      | gzip > "$pages_partial"
  ); then
    echo "Error converting pages data for ${archive}, exiting" >&2
    rm -f -- "$pages_partial"
    exit 1
  fi
  if ! mv -- "$pages_partial" "${out_dir}/pages.csv.gz"; then
    echo "Error storing converted pages data for ${archive}, exiting" >&2
    exit 1
  fi
else
  echo -e "Pages data already converted, skipping."
fi

# compgen -G succeeds when at least one file matches the pattern.
if compgen -G "${out_dir}/requests_*" > /dev/null; then
  echo -e "Request data already converted, skipping."
else
  echo -e "Converting requests data"
  requests_staging="${out_dir}/.requests_staging"
  # Drop anything left over from an earlier interrupted run.
  rm -rf -- "$requests_staging"
  if ! mkdir -p -- "$requests_staging"; then
    echo "Cannot create ${requests_staging}, exiting" >&2
    exit 1
  fi
  # NOTE(review): fixcsv.py is looked up in the current directory, which is
  # $DATA after the earlier cd, not the directory the script was started
  # from - confirm the file is present there.
  # $FILE is deliberately single-quoted: split sets it for each chunk when it
  # runs the filter command, so this shell must not expand it.
  # shellcheck disable=SC2016
  if ! (
    set -o pipefail
    gunzip -c "httparchive_${archive}_requests.csv.gz" \
      | sed -e 's/\\N,/"",/g' -e 's/\\N$/""/g' -e 's/\\"/""/g' -e 's/\\"","/\\\\","/g' \
      | python fixcsv.py \
      | split --lines=8000000 --filter='pigz - > $FILE.gz' - "${requests_staging}/requests_"
  ); then
    echo "Error converting requests data for ${archive}, exiting" >&2
    rm -rf -- "$requests_staging"
    exit 1
  fi
  # Publish the finished chunks; this also fails when no chunk was produced.
  if ! mv -- "${requests_staging}"/requests_* "${out_dir}/"; then
    echo "Error storing converted requests data for ${archive}, exiting" >&2
    rm -rf -- "$requests_staging"
    exit 1
  fi
  rmdir -- "$requests_staging"
fi
# Upload the converted files to Google Storage. The cd has to succeed first:
# the copy below uses a bare `*` glob, so running it from any other directory
# would push whatever files happen to be there into the bucket.
if ! cd -- "processed/${archive}"; then
  echo "Cannot change directory to processed/${archive}, exiting" >&2
  exit 1
fi
echo -e "Syncing data to Google Storage"
# -n is gsutil's no-clobber option.
gsutil cp -n * "gs://httparchive/${archive}/"
# Load the pages table from the uploaded CSV unless BigQuery already has it.
if bq show httparchive:${ptable} > /dev/null 2>&1; then
  echo -e "${ptable} already exists, skipping."
else
  echo -e "Submitting new pages import ${ptable} to BigQuery"
  if ! bq load --max_bad_records 10 --replace $ptable gs://httparchive/${archive}/pages.csv.gz $BASE/schema/pages.json; then
    echo "Error loading ${ptable}, exiting"
    exit 1
  fi
fi

# Load the requests table from all uploaded requests_* chunks, same rules.
if bq show httparchive:${rtable} > /dev/null 2>&1; then
  echo -e "${rtable} already exists, skipping."
else
  echo -e "Submitting new requests import ${rtable} to BigQuery"
  if ! bq load --max_bad_records 10 --replace $rtable gs://httparchive/${archive}/requests_* $BASE/schema/requests.json; then
    echo "Error loading ${rtable}, exiting"
    exit 1
  fi
fi
# Delete the local CSV artifacts, but only once the requests table is confirmed
# to exist in BigQuery; otherwise keep everything and fail.
if ! bq show "httparchive:${rtable}" &> /dev/null; then
  echo "Error loading into BigQuery, exiting" >&2
  exit 1
fi

echo -e "Deleting CSV artifacts..."
# ${var:?} aborts the script if DATA or archive is empty, so these deletions
# can never expand to a path rooted directly at / or at processed/ itself.
# The trailing _* stays outside the quotes so that it still globs.
rm -- "${DATA:?}/httparchive_${archive:?}"_*
rm -r -- "${DATA:?}/processed/${archive:?}"
# Generate the reports for this crawl date when the stat on the reports path
# returns exit status 1; any other status is reported as "already exist".
echo -e "Attempting to generate reports for ${YYYY_MM_DD}..."
cd $HOME/code
gsutil -q stat gs://httparchive/reports/${YYYY_MM_DD}/*
case $? in
  1)
    # Sourced, so the report script runs in this shell.
    . sql/generate_reports.sh -th ${YYYY_MM_DD} -l ALL
    ;;
  *)
    echo -e "Reports for ${YYYY_MM_DD} already exist, skipping."
    ;;
esac

echo "Done"