Skip to content

Commit

Permalink
fix: rm preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
10zinten committed May 15, 2023
1 parent c95666e commit 9a72167
Showing 1 changed file with 1 addition and 41 deletions.
42 changes: 1 addition & 41 deletions align_tib_en.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,48 +13,8 @@ cp $2 $2.work
# Directory that receives the results: third positional argument, or "output" when it is unset or empty.
output_dir=${3:-"output"}
# NOTE(review): plain mkdir (no -p) reports an error when the directory already exists,
# and the unquoted expansion is word-split if the path contains spaces.
mkdir $output_dir

# Preprocessing that normalises line breaks. Ideally there is one "sentence" per line, and the number of sentences in the Tibetan and English files matches as closely as possible before the aligner is applied.
# The steps below edit $1.work in place and are order-dependent: "_" is used as a temporary placeholder for spaces that must not become split points.
# Protect the "shad, space, shad" sequence by replacing its space with "_".
perl -p -CIO -i -e 's/། །/།_།/g;' $1.work
# Strip every newline (each input line loses its terminator, so the text is joined together) ...
perl -C -p -i -e 's/\n//g;' $1.work
# ... and every carriage return.
perl -C -p -i -e 's/\r//g;' $1.work
# Protect the space after "དང་།" with the same placeholder so the split below skips it.
perl -p -CIO -i -e 's/དང་། /དང་།_/g;' $1.work

# Insert a line break after each shad that is followed by anything other than the "_" placeholder.
perl -p -CIO -i -e 's/།([^_])/།\n$1/g;' $1.work
# Restore the protected space after "དང་།".
perl -p -CIO -i -e 's/དང་།_/དང་། /g;' $1.work
# Drop leading spaces from each line.
perl -p -CIO -i -e 's/^ +//g;' $1.work
# Delete every run of ASCII letters and digits.
perl -p -CIO -i -e 's/[0-9a-zA-Z]+//g;' $1.work
# Turn all remaining underscores (including the placeholders inserted above) into spaces.
sed -i -e 's/_/ /g' $1.work
# Remove a digit immediately followed by a colon.
# NOTE(review): ASCII digits were already deleted two steps earlier, so this probably matches nothing — confirm.
sed -i "s/[0-9]://g;" $1.work


# Sentence-split $2.work in place.
# Remove tokens made of a space, digits 1-9, then letters/dots/hyphens, ending in a dot
# (presumably numbering or reference markers such as " 12a." — confirm against real input).
perl -p -CIO -i -e 's/ [1-9]+[a-z.-]+\.//g;' $2.work
# Replace "vs." with "vs " so its dot is not followed by a space-triggered split below.
perl -p -CIO -i -e 's/vs\./vs /g;' $2.work
# Collapse runs of spaces into a single space.
perl -p -CIO -i -e 's/ +/ /g;' $2.work
# Insert a line break after ASCII or fullwidth sentence punctuation that is followed by a space.
perl -p -CIO -i -e 's/([.!?:;;!?:] )/$1\n/g;' $2.work
# Delete lines of at most 7 characters (the "\{,7\}" interval with no lower bound is GNU sed syntax).
sed -i '/^.\{,7\}$/d' $2.work

# Keep a copy of $2.work as it is before bracketed text is stripped (how $2.work2 is used is not visible in this excerpt).
cp $2.work $2.work2

# Remove bracketed spans from $2.work. Each pattern only matches a pair with no bracket of the same kind inside it.
# (...) without nested parentheses:
sed -i -e 's/([^()]*)//g' $2.work
# [...] without nested square brackets:
sed -i -e 's/\[[^][]*\]//g' $2.work
# {...} without nested braces:
sed -i -e 's/{[^}{]*}//g' $2.work

#sed -i -e 's/{[^}{]*}//g' $1.work
# Repeat the brace removal: every pass strips the innermost remaining {...} level, so nested braces are peeled away one level per pass.
sed -i "s/{[^{}]*}//g" $2.work
sed -i "s/{[^{}]*}//g" $2.work
sed -i "s/{[^{}]*}//g" $2.work
sed -i "s/{[^{}]*}//g" $2.work
sed -i "s/{[^{}]*}//g" $2.work

#sed -i "s/{[^{}]*}//g" $1.work
#sed -i "s/{[^{}]*}//g" $1.work
#sed -i "s/{[^{}]*}//g" $1.work
#sed -i "s/{[^{}]*}//g" $1.work
#sed -i "s/{[^{}]*}//g" $1.work

# Delete completely empty lines from both work files (lines containing only whitespace are kept).
sed -i '/^$/d' $1.work
sed -i '/^$/d' $2.work

# Run get_vectors.py once per work file, passing $number_of_overlays (assigned outside this excerpt);
# `time` prints how long each run takes.
# The cleanup further down removes "<file>.work_vectors.npy", presumably written by these runs — confirm in get_vectors.py.
echo '[INFO] Getting Embedding...'
time python get_vectors.py $1.work $number_of_overlays
time python get_vectors.py $2.work $number_of_overlays
Expand All @@ -76,7 +36,7 @@ mv *.txt* $output_dir/
# Move requirements.txt back to the current directory; the `mv *.txt* $output_dir/` shown in the hunk header
# would have swept it into the output directory along with the result files.
mv $output_dir/requirements.txt ./
# Remove the intermediate work files from the output directory.
rm $output_dir/$1.work
rm $output_dir/$2.work
# NOTE(review): the next command appears twice. In this diff rendering the two lines are presumably the
# removed and added versions of a single line (likely a whitespace-only change) — confirm against the raw diff.
# If both were really executed, the second rm would report a missing file.
rm $output_dir/$2.work2
rm $output_dir/$2.work2
# Remove the per-file vector arrays.
rm $output_dir/$1.work_vectors.npy
rm $output_dir/$2.work_vectors.npy

Expand Down

0 comments on commit 9a72167

Please sign in to comment.