-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalign.sh
83 lines (53 loc) · 2.8 KB
/
align.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash
#
# Align a transcript against a wave file with the exp/tdnn_7b_chain_online
# model and write word/phone CTMs, a state sequence, per-frame likelihoods
# and a VTT file.
#
# Usage:
#   align.sh <input_trans> <input_wav> <input_lang_dir> <out_ctm> \
#            <out_phone_ctm> <out_state_seq> <likelihood_per_frame> <out_vtt>
#
# Arguments:
#   $1  input transcript (can be nbest)
#   $2  input wave file
#   $3  lang directory; its contents are copied into temp/lang
#   $4  output CTM file containing the aligned transcript (words)
#   $5  output CTM file with the phone alignment
#   $6  output file for the state sequence
#   $7  output file for the per-frame likelihoods
#   $8  output VTT file
#
# All helper paths (path.sh, scripts/, utils/, exp/, the two top-level python
# scripts) and the working directory temp/ are relative to the current
# directory, so the script has to be started from the directory holding them.
# temp/ is reused between runs and is not cleaned up.

source path.sh

# Strict mode is enabled only after path.sh has been sourced: path.sh is not
# under our control here, so it keeps running with the shell's default options.
# Failing fast matters because temp/ survives between runs; without it a
# failed stage would let later stages silently consume stale files.
set -e -o pipefail

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

usage() {
  printf 'Usage: %s <input_trans> <input_wav> <input_lang_dir> <out_ctm> <out_phone_ctm> <out_state_seq> <likelihood_per_frame> <out_vtt>\n' \
    "${0##*/}" >&2
  exit 2
}

# Extra arguments beyond the eighth are ignored.
if (( $# < 8 )); then
  usage
fi

# input transcript (can be nbest)
input_trans=$1
# input wave file
input_wav=$2
input_lang_dir=$3
# ctm file contains the aligned transcript
out_ctm=$4
out_phone_ctm=$5
out_state_seq=$6
likelihood_per_frame=$7
out_vtt=$8

# An empty lang dir would turn the copy below into "cp -rf /* temp/lang".
[[ -n "$input_lang_dir" ]] || die "input_lang_dir must not be empty"
[[ -d "$input_lang_dir" ]] || die "lang directory not found: $input_lang_dir"

readonly model_dir=exp/tdnn_7b_chain_online

#rm -rf temp
mkdir -p temp

# create scp files
python scripts/create_scp.py -i "$input_wav" -o temp

# copy the lm data
mkdir -p temp/lang
mkdir -p temp/graph_pp
cp -rf -- "${input_lang_dir:?}"/* temp/lang

# expand and update the transcriptions
# The tags are quoted so the shell cannot treat them as glob bracket patterns
# (an existing one-letter file such as "a" would otherwise replace the tag).
python scripts/convert_trans.py -i "$input_trans" -o temp/trans.txt \
  -l '[laughter]' -n '[noise]' -u "<unk>" -w temp/lang/words.txt

# make sure srilm is installed and is in the path
# (path.sh should do it, but in case it does not)
command -v ngram-count >/dev/null \
  || die "ngram-count (SRILM) not found in PATH"
ngram-count -text temp/trans.txt -order 2 -addsmooth 0.1 -unk \
  -lm temp/custom.lm
#sh test.sh

# create a FST for custom lm (G.fst)
arpa2fst '--disambig-symbol=#0' --read-symbol-table=temp/lang/words.txt \
  - temp/lang/G.fst < temp/custom.lm

# create HCLG graph
utils/mkgraph.sh --self-loop-scale 1.0 temp/lang "$model_dir" temp/graph_pp

# run the recognizer
# subsample the frames by factor 3
# --frame-subsampling-factor=1
mkdir -p temp/out
# stdout and stderr of the recognizer both go to temp/recog_logs, which is
# then passed to extract_likelihood_per_frame.py.
online2-wav-nnet3-latgen-faster \
  --online=true \
  --do-endpointing=false \
  "--config=${model_dir}/conf/online.conf" \
  --max-active=7000 \
  --beam=15.0 \
  --lattice-beam=6.0 \
  --acoustic-scale=1.0 \
  --word-symbol-table=temp/graph_pp/words.txt \
  "${model_dir}/final.mdl" \
  temp/graph_pp/HCLG.fst \
  "ark:temp/spk2utt.scp" \
  "scp:temp/wav.scp" \
  "ark:|lattice-scale --acoustic-scale=0.5 ark:- ark:- | gzip -c >temp/out/lat.1.gz" \
  > temp/recog_logs 2>&1 \
  || die "recognizer failed; see temp/recog_logs"
python extract_likelihood_per_frame.py temp/recog_logs "$likelihood_per_frame"

# create time alignment
# also account for frame sub-sampling
# --frame-shift=0.01
lattice-align-words-lexicon temp/lang/phones/align_lexicon.int \
  "${model_dir}/final.mdl" "ark:gunzip -c temp/out/lat.1.gz|" ark:- \
  | lattice-1best ark:- ark:- \
  | nbest-to-ctm ark:- temp/out/align.ctm

# gunzip -c (as in the pipeline above) instead of zcat: some zcat
# implementations only accept files with a .Z suffix.
gunzip -c temp/out/lat.1.gz > temp/out/lat.1
lattice-1best --acoustic-scale=1 ark:temp/out/lat.1 ark:temp/out/1best.lats
nbest-to-linear ark:temp/out/1best.lats ark,t:temp/out/1.ali
ali-to-phones --ctm-output "${model_dir}/final.mdl" ark:temp/out/1.ali \
  temp/out/phone_alined.ctm

# convert the word and phone CTMs using the respective symbol tables
python scripts/convert_ctm.py -i temp/out/align.ctm \
  -w temp/lang/words.txt -o "$out_ctm"
python scripts/convert_ctm.py -i temp/out/phone_alined.ctm \
  -w temp/lang/phones.txt -o "$out_phone_ctm"

# dump the alignment and the model's transitions as text, then map them
copy-int-vector ark:temp/out/1.ali ark,t:temp/out/transids.txt
show-transitions temp/lang/phones.txt "${model_dir}/final.mdl" \
  > temp/out/transitions.txt
python map_kaldi_transitionids.py --input temp/out/transids.txt \
  --input_transitions temp/out/transitions.txt --output "$out_state_seq"

python scripts/convert_vtt.py --input_ctm "$out_ctm" --output_vtt "$out_vtt"