trunk: Adding DNN-based speaker recognition recipe in egs/sre10
git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5223 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
1 parent 189c774 · commit 55d8f86
Showing 31 changed files with 2,833 additions and 3 deletions.
@@ -0,0 +1,94 @@
#!/bin/bash

# Copyright      2013  Daniel Povey
#           2014-2015  David Snyder
#                2015  Johns Hopkins University (Author: Daniel Garcia-Romero)
#                2015  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This script extracts iVectors for a set of utterances, given
# features and a trained DNN-based iVector extractor.

# Begin configuration section.
nj=30
cmd="run.pl"
stage=0
min_post=0.025      # Minimum posterior to use (posteriors below this are pruned out)
posterior_scale=1.0 # This scale helps to control for successive features being highly
                    # correlated.  E.g. try 0.1 or 0.3.
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
  echo "Usage: $0 <extractor-dir> <dnn-model> <data-speaker-id> <data-dnn> <ivector-dir>"
  echo " e.g.: $0 exp/extractor_2048_male exp/dnn/final.mdl data/train_male data/train_male_dnn exp/ivectors_male"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|30>                                      # Number of jobs"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --min-post <min-post|0.025>                      # Pruning threshold for posteriors"
  exit 1;
fi
srcdir=$1
nnet=$2
data=$3
data_dnn=$4
dir=$5

for f in $srcdir/final.ie $srcdir/final.ubm $data/feats.scp ; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done

# Set various variables.
mkdir -p $dir/log
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;

sdata_dnn=$data_dnn/split$nj;
utils/split_data.sh $data_dnn $nj || exit 1;

delta_opts=`cat $srcdir/delta_opts 2>/dev/null`

splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options

## Set up features.
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"

nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"

if [ $stage -le 0 ]; then
  echo "$0: extracting iVectors"
  $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \
    nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
    \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
    \| logprob-to-post --min-post=$min_post ark:- ark:- \| \
    scale-post ark:- $posterior_scale ark:- \| \
    ivector-extract --verbose=2 $srcdir/final.ie "$feats" ark,s,cs:- \
      ark,scp,t:$dir/ivector.JOB.ark,$dir/ivector.JOB.scp || exit 1;
fi

if [ $stage -le 1 ]; then
  echo "$0: combining iVectors across jobs"
  for j in $(seq $nj); do cat $dir/ivector.$j.scp; done >$dir/ivector.scp || exit 1;
fi

if [ $stage -le 2 ]; then
  # Be careful here: the speaker-level iVectors are now length-normalized,
  # even if they are otherwise the same as the utterance-level ones.
  echo "$0: computing mean of iVectors for each speaker and length-normalizing"
  $cmd $dir/log/speaker_mean.log \
    ivector-normalize-length scp:$dir/ivector.scp ark:- \| \
    ivector-mean ark:$data/spk2utt ark:- ark:- ark,t:$dir/num_utts.ark \| \
    ivector-normalize-length ark:- ark,scp:$dir/spk_ivector.ark,$dir/spk_ivector.scp || exit 1;
fi
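For orientation, here is a hedged sketch of how this extraction script might be invoked once an extractor has been trained. The script's own path is not visible in this diff view, so the sid/extract_ivectors_dnn.sh name and the data/experiment directory names are assumptions for illustration only; the five positional arguments follow the order parsed above.

# Hypothetical invocation; the script path and directory names are
# illustrative assumptions, not taken from this commit.
# Arguments: <extractor-dir> <dnn-model> <data-speaker-id> <data-dnn> <ivector-dir>
sid/extract_ivectors_dnn.sh --cmd "run.pl" --nj 30 \
  exp/extractor_dnn exp/dnn/final.mdl \
  data/sre10_test data/sre10_test_dnn \
  exp/ivectors_sre10_test_dnn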
@@ -0,0 +1,79 @@
#!/bin/bash
# Copyright      2015  David Snyder
#                2015  Johns Hopkins University (Author: Daniel Garcia-Romero)
#                2015  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# This script derives a full-covariance UBM from DNN posteriors and
# speaker recognition features.

# Begin configuration section.
nj=40
cmd="run.pl"
stage=-2
delta_window=3
delta_order=2
num_components=5297
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
  echo "Usage: steps/init_full_ubm_from_dnn.sh <data-speaker-id> <data-dnn> <dnn-model> <new-ubm-dir>"
  echo "Initializes a full-covariance UBM from DNN posteriors and speaker recognition features."
  echo " e.g.: steps/init_full_ubm_from_dnn.sh data/train data/train_dnn exp/dnn/final.mdl exp/full_ubm"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|40>                                      # number of parallel training jobs"
  echo "  --delta-window <n|3>                             # delta window size"
  echo "  --delta-order <n|2>                              # delta order"
  echo "  --num-components <n|5297>                        # number of components in the final GMM; needs"
  echo "                                                   # to be equal to the size of the DNN output layer."
  exit 1;
fi
data=$1
data_dnn=$2
nnet=$3
dir=$4

for f in $data/feats.scp $data/vad.scp; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done

mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
utils/split_data.sh $data $nj || exit 1;

sdata_dnn=$data_dnn/split$nj;
utils/split_data.sh $data_dnn $nj || exit 1;

delta_opts="--delta-window=$delta_window --delta-order=$delta_order"
echo $delta_opts > $dir/delta_opts

logdir=$dir/log

nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"

feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | \
  apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | \
  select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"

$cmd JOB=1:$nj $logdir/make_stats.JOB.log \
  nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
  \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
  \| logprob-to-post ark:- ark:- \| \
  fgmm-global-acc-stats-post ark:- $num_components "$feats" \
    $dir/stats.JOB.acc || exit 1;

$cmd $dir/log/init.log \
  fgmm-global-init-from-accs --verbose=2 \
    "fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \
    $dir/final.ubm || exit 1;

exit 0;
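As the help text above notes, the --num-components value has to equal the size of the DNN output layer (the senone dimension). The following is a minimal sketch of how that dimension might be queried and passed through, assuming Kaldi's nnet-am-info tool prints an "output-dim" field; the exact label, the grep/awk pattern, and the directory names are assumptions, not taken from this commit.

# Hedged sketch; the grep/awk pattern and directory names are assumptions.
num_pdfs=$(nnet-am-info exp/dnn/final.mdl 2>/dev/null | \
  grep -m1 'output-dim' | awk '{print $NF}')
steps/init_full_ubm_from_dnn.sh --cmd "run.pl" --nj 40 \
  --num-components ${num_pdfs:-5297} \
  data/train data/train_dnn exp/dnn/final.mdl exp/full_ubm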
@@ -0,0 +1,181 @@
#!/bin/bash

# Copyright      2013  Daniel Povey
#           2014-2015  David Snyder
#                2015  Johns Hopkins University (Author: Daniel Garcia-Romero)
#                2015  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This script trains the i-vector extractor using a DNN-based UBM.  It also requires
# an fGMM, usually created by the script sid/init_full_ubm_from_dnn.sh.
# Note: there are 3 separate levels of parallelization: num_threads, num_processes,
# and num_jobs.  This may seem a bit excessive.  It has to do with minimizing
# memory usage and disk I/O, subject to various constraints.  The "num_threads"
# is how many threads a program uses; the "num_processes" is the number of separate
# processes a single job spawns; the job then sums their accumulators in memory.
# Our recommendation:
#  - Set num_threads to the minimum of 4 and the number of virtual cores your
#    machine has (because of needing to lock various global quantities, the
#    program can't use many more than 4 threads with good CPU utilization).
#  - Set num_processes to the number of virtual cores on each machine, divided by
#    num_threads.  E.g. 4, if you have 16 virtual cores.  If you're on a shared queue
#    that's busy with other people's jobs, it may be wise to set it to somewhat less
#    than this maximum, or your jobs won't get scheduled.  And if memory is
#    tight you need to be careful; in our normal setup, each process uses about 5G.
#  - Set num_jobs to as many jobs (each using $num_threads * $num_processes CPUs)
#    as your queue will let you run at one time, but don't go much above 10 or 20,
#    or summing the accumulators may get slow.  If you have a lot of data, you
#    may want more jobs, though.
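# A worked example of the arithmetic above (the machine size is hypothetical,
# chosen only for illustration): on queue nodes with 16 virtual cores, use
# num_threads=4 and num_processes=16/4=4, so each queue job occupies
# num_threads*num_processes=16 CPUs and roughly 4 processes * 5G = 20G of
# memory; with nj=10, the data is split into nj*num_processes=40 pieces and
# up to 10*16=160 CPUs are in use at once.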
# Begin configuration section.
nj=10   # this is the number of separate queue jobs we run, but each one
        # contains num_processes sub-jobs; the real number of threads we
        # run is nj * num_processes * num_threads, and the number of
        # separate pieces of data is nj * num_processes.
num_threads=4
num_processes=4 # each job runs this many processes, each with --num-threads threads
cmd="run.pl"
stage=-4
num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select
ivector_dim=400 # dimension of the extracted i-vector
use_weights=false # set to true to turn on the regression of log-weights on the ivector.
num_iters=10
min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out)
num_samples_for_weights=3 # smaller than the default for speed (relates to a sampling method)
cleanup=true
posterior_scale=1.0 # This scale helps to control for successive features being highly
                    # correlated.  E.g. try 0.1 or 0.3.
sum_accs_opt=
# End configuration section.

echo "$0 $@"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 5 ]; then
  echo "Usage: $0 <fgmm-model> <dnn-model> <data-speaker-id> <data-dnn> <extractor-dir>"
  echo " e.g.: $0 exp/sup_ubm/final.ubm exp/dnn/final.mdl data/train data/train_dnn exp/extractor_male"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --num-iters <#iters|10>                          # Number of iterations of E-M"
  echo "  --nj <n|10>                                      # Number of jobs (also see num-processes and num-threads)"
  echo "  --num-processes <n|4>                            # Number of processes for each queue job (relates"
  echo "                                                   # to summing accs in memory)"
  echo "  --num-threads <n|4>                              # Number of threads for each process (can't be usefully"
  echo "                                                   # increased much above 4)"
  echo "  --stage <stage|-4>                               # To control partial reruns"
  echo "  --num-gselect <n|20>                             # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --sum-accs-opt <option|''>                       # Option e.g. '-l hostname=a15' to localize"
  echo "                                                   # sum-accs process to nfs server."
  exit 1;
fi

fgmm_model=$1
nnet=$2
data=$3
data_dnn=$4
dir=$5

srcdir=$(dirname $fgmm_model)

for f in $fgmm_model $data/feats.scp ; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done

# Set various variables.
mkdir -p $dir/log
nj_full=$[$nj*$num_processes]
sdata=$data/split$nj_full;
utils/split_data.sh $data $nj_full || exit 1;

sdata_dnn=$data_dnn/split$nj_full;
utils/split_data.sh $data_dnn $nj_full || exit 1;

delta_opts=`cat $srcdir/delta_opts 2>/dev/null`
if [ -f $srcdir/delta_opts ]; then
  cp $srcdir/delta_opts $dir/ 2>/dev/null
fi

splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options

parallel_opts="-pe smp $[$num_threads*$num_processes]"

## Set up features.
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"

nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.scp ark:- |"

# Initialize the i-vector extractor using the FGMM input
if [ $stage -le -2 ]; then
  cp $fgmm_model $dir/final.ubm || exit 1;
  $cmd $dir/log/convert.log \
    fgmm-global-to-gmm $dir/final.ubm $dir/final.dubm || exit 1;
  $cmd $dir/log/init.log \
    ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \
      $dir/final.ubm $dir/0.ie || exit 1;
fi
# Do Gaussian selection and posterior extraction

if [ $stage -le -1 ]; then
  echo $nj_full > $dir/num_jobs
  echo "$0: doing DNN posterior computation"
  $cmd JOB=1:$nj_full $dir/log/post.JOB.log \
    nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \
    \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \
    \| logprob-to-post --min-post=$min_post ark,s,cs:- ark:- \| \
    scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1;
else
  if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then
    echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)"
    exit 1
  fi
fi

x=0
while [ $x -lt $num_iters ]; do
  if [ $stage -le $x ]; then
    rm $dir/.error 2>/dev/null

    Args=() # bash array of training commands for 1:nj, that put accs to stdout.
    for j in $(seq $nj_full); do
      Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads --num-samples-for-weights=$num_samples_for_weights $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g`
    done

    echo "Accumulating stats (pass $x)"
    for g in $(seq $nj); do
      start=$[$num_processes*($g-1)+1]
      $cmd $parallel_opts $dir/log/acc.$x.$g.log \
        ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \
          $dir/acc.$x.$g || touch $dir/.error &
    done
    wait
    [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
    accs=""
    for j in $(seq $nj); do
      accs+="$dir/acc.$x.$j "
    done
    echo "Summing accs (pass $x)"
    $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \
      ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1;
    echo "Updating model (pass $x)"
    nt=$[$num_threads*$num_processes] # use the same total number of threads that
                                      # each accumulation job uses, since we can
                                      # be sure the queue will support this many.
    $cmd -pe smp $nt $dir/log/update.$x.log \
      ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1;
    rm $dir/acc.$x.*
    if $cleanup; then
      rm $dir/acc.$x
      # rm $dir/$x.ie
    fi
  fi
  x=$[$x+1]
done

ln -s $x.ie $dir/final.ie
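Taken together, the three scripts in this commit form a pipeline: derive the DNN-based full-covariance UBM, train the i-vector extractor, then extract i-vectors. Below is a hedged sketch of that chaining. The diff view does not show the paths of the files being added, so the sid/ script names and the data directory names are assumptions (only init_full_ubm_from_dnn.sh is named in its own usage message); the options and positional arguments follow the scripts above.

# Hedged sketch of the overall recipe; sid/ script names and data
# directories are assumptions, not confirmed by this commit.
sid/init_full_ubm_from_dnn.sh --cmd "run.pl" --nj 40 \
  data/train data/train_dnn exp/dnn/final.mdl exp/full_ubm

sid/train_ivector_extractor_dnn.sh --cmd "run.pl" --nj 10 \
  --num-threads 4 --num-processes 4 --ivector-dim 400 \
  exp/full_ubm/final.ubm exp/dnn/final.mdl data/train data/train_dnn \
  exp/extractor_dnn

sid/extract_ivectors_dnn.sh --cmd "run.pl" --nj 30 \
  exp/extractor_dnn exp/dnn/final.mdl data/sre10_test data/sre10_test_dnn \
  exp/ivectors_sre10_test_dnn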
@@ -0,0 +1,5 @@
This directory contains DNN scripts based on the nnet2 recipes found in
the ASR examples (e.g., fisher_english).  The scripts have been modified
for speaker recognition purposes.  Most of the scripts are lightly modified
versions of those appearing in the steps or local directories of
egs/fisher_english.