-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathgetdomains.sh
executable file
·77 lines (64 loc) · 2.5 KB
/
getdomains.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/bin/sh
#
# This script fetches all the domains and splits them up into batches
# that can be used to parallelize the scanning across tasks.
#
# Number of domains handed to a single scanning task.  A non-empty
# BATCHSIZE already present in the environment takes precedence.
: "${BATCHSIZE:=6000}"
# Directory part of the path this script was invoked with; the helper
# script and the domains dir are looked up relative to it.
BINDIR="$(dirname "$0")"
# Merge one extra CSV file into /tmp/domains.csv.
#
# Runs the mergedomaincsv.py script from $BINDIR, which is relied on to
# ensure that there are no duplicate domains and that the metadata from
# the first instance of a domain remains.
#
# Arguments: $1 - path of the CSV file to merge in
# Globals:   BINDIR (read)
# Returns:   0 on success; 1 if the merge script failed, in which case
#            /tmp/domains.csv is left exactly as it was.
mergedomains() {
  if "$BINDIR/mergedomaincsv.py" /tmp/domains.csv "$1" /tmp/mergeddomains.csv ; then
    mv /tmp/mergeddomains.csv /tmp/domains.csv
  else
    # Never replace the good list with a partial or missing merge result.
    echo "mergedomains: failed to merge $1; /tmp/domains.csv left unchanged" >&2
    rm -f /tmp/mergeddomains.csv
    return 1
  fi
}
# get the list of domains
#
# Produces /tmp/domains.csv.  If $DOMAINCSV names an existing file it is
# copied as-is; otherwise the base list is downloaded and every csv file
# in $BINDIR/domains is merged into it with mergedomains.
if [ -f "$DOMAINCSV" ] ; then
  # this is so we can supply our own file for testing
  cp "$DOMAINCSV" /tmp/domains.csv
else
  # This is the base domain file with metadata in it already.
  # wget -O leaves an empty or partial file behind when the download
  # fails, so stop here rather than split up a bogus list.
  if ! wget -O /tmp/domains.csv https://github.com/GSA/data/raw/master/dotgov-domains/current-federal.csv ; then
    echo "failed to download the base domain list" >&2
    exit 1
  fi

  # XXX HERE IS WHERE TO ADD MORE DOMAINS/SUBDOMAINS!!!
  # To add more files, make sure they are in the same format as the
  # current-federal.csv file, OR make sure that the domain is the first
  # field in your csv file.  You can put them in the domains dir, and
  # they will be merged in so there are no duplicates and metadata from
  # the first instance of the domain will be preserved.
  #wget -O "$BINDIR/domains/other-websites.csv" https://raw.githubusercontent.com/GSA/data/master/dotgov-websites/other-websites.csv
  if ! wget -O "$BINDIR/domains/0pulse.csv" https://github.com/GSA/data/raw/master/dotgov-websites/pulse-subdomains-snapshot-06-08-2020-https.csv ; then
    # carry on with the base list, but don't merge a truncated download
    echo "warning: failed to download the pulse subdomain snapshot; skipping it" >&2
    rm -f "$BINDIR/domains/0pulse.csv"
  fi

  # $BINDIR is quoted so a path containing spaces is not word-split;
  # the glob itself stays outside the quotes so it still expands.
  for i in "$BINDIR"/domains/*.csv ; do
    # when nothing matches, the pattern is passed through literally
    [ -f "$i" ] || continue
    echo "merging $i into /tmp/domains.csv"
    mergedomains "$i"
  done

  # clean up in case you are running this by hand and don't want to accidentally check this in
  rm -f "$BINDIR/domains/other-websites.csv"
  rm -f "$BINDIR/domains/0pulse.csv"
fi
# Decide which directory the split-up CSV files go into.
# A directory given as the first argument always wins.
SPLITDIR="$1"
if [ -z "$SPLITDIR" ] ; then
  # Nothing on the command line: start from the last-resort location
  # and override it with the first preferred candidate that exists.
  SPLITDIR="/tmp/splitdir"
  if [ -d "$TMPDIR" ] ; then
    SPLITDIR="$TMPDIR/splitdir"
  elif [ -d "/home/vcap/tmp/splitdir" ] ; then
    SPLITDIR="/home/vcap/tmp/splitdir"
  fi
fi
# Split /tmp/domains.csv into $SPLITDIR as files of at most $BATCHSIZE
# domains each, every one carrying its own CSV header line.

# remove the header line from the CSV
tail -n +2 /tmp/domains.csv > /tmp/domains.txt

# split the CSV up!
echo splitting into "$SPLITDIR"
# ${SPLITDIR:?} aborts the script instead of expanding to "rm -rf /*"
# should SPLITDIR ever be empty or unset.
rm -rf "${SPLITDIR:?}"/*
mkdir -p "$SPLITDIR"
# split writes its output into the current directory, so do not carry
# on from wherever we happen to be if the cd fails.
cd "$SPLITDIR" || exit 1
split -l "$BATCHSIZE" /tmp/domains.txt

# put the header line back into each file so that the CSV parser does the right thing
# We are already inside $SPLITDIR, so glob the current directory rather
# than parsing ls output: that also works when SPLITDIR is a relative
# path, and with file names containing whitespace.  The glob is expanded
# once, before the loop body runs, so the new .csv files are not revisited.
for i in * ; do
  # with an empty domain list split creates nothing and '*' stays literal
  [ -f "$i" ] || continue
  echo 'Domain Name,Domain Type,Agency,Organization,City,State,Security Contact Email' > "$i.csv"
  cat "$i" >> "$i.csv"
  rm "$i"
done