make_lang_corpus.sh
#!/bin/bash
echo "Make sure you're in a good conda environment!"
echo "This whole script can take hours, make sure that's ok!"
lang=$1
mkdir -p "$lang" && cd "$lang" || exit 1
echo "Getting and unzipping wikipedia dumps for $lang"
curl -O https://dumps.wikimedia.org/"$lang"wiki/latest/"$lang"wiki-latest-pages-articles.xml.bz2
bunzip2 "$lang"wiki-latest-pages-articles.xml.bz2
cd ..
# Download the trained language models you want for the language, and save them
# to the common location on the cluster.
# Make sure you have the conda env you want activated when you do this.
echo "Downloading the pretrained models for stanfordnlp"
echo "import stanfordnlp; stanfordnlp.download(\"$lang\")"
printf "yes\n/u/nlp/data/stanfordnlp_resources/" | python -c "import stanfordnlp; stanfordnlp.download(\"$lang\")"
# Now we are ready to run a corpus creation job.
# This creates a file where every line is a sentence (sentence segmentation done
# by stanfordnlp) and every token is separated by a space (tokenisation also by
# stanfordnlp)
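# For example (illustrative only, not taken from a real dump), a single corpus line
# might look like:
#   The cat sat on the mat .
# i.e. one sentence per line, with punctuation split off as its own token.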
echo "Parsing the wikipedia dump into a corpus. This is the part that really takes a while"
echo "$lang"
echo "$lang"/parselog
python process_wiki_dump.py \
    --dump-file "$lang"/"$lang"wiki-latest-pages-articles.xml \
    --output-file "$lang"/"$lang"wiki-corpus \
    > "$lang"/parselog
echo "Splitting the corpus into train/val/test"
# Split the corpus into train-dev-test
./split_corpus.sh "$lang" > "$lang"/splitlog
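# split_corpus.sh is a separate script in this repo and is not reproduced here.
# As a rough, assumed illustration only (the 80/10/10 ratio and file names are guesses,
# not the actual script), a line-based split could look like:
#   corpus="$lang/${lang}wiki-corpus"
#   total=$(wc -l < "$corpus")
#   n_train=$((total * 80 / 100)); n_dev=$((total * 10 / 100))
#   head -n "$n_train" "$corpus" > "$lang/train.txt"
#   tail -n +"$((n_train + 1))" "$corpus" | head -n "$n_dev" > "$lang/dev.txt"
#   tail -n +"$((n_train + n_dev + 1))" "$corpus" > "$lang/test.txt"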