-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtypester.sh
executable file
·177 lines (149 loc) · 4.93 KB
/
typester.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/bin/bash
# input required by user are:
# Y fasta reference
Y_ref=""
# list of bam files
samples=""
HipSTR="HipSTR"
max_motif=6
max_len=100
skip=false
# usage print if -h
usage() {
echo "Usage: $(basename $0) -r ref_y -s samples.txt" 2>&1
echo 'Identify and genotype Y-STRS from WGS samples.'
echo "[ -r Ref_Y ] Y chromosome reference fasta"
echo "[ -s samples ] file with bam file paths"
echo "[ -t HipSTR ] path to compiled HipSTR (optional if in \$PATH)"
echo "[ -e exclude ] BED of regions to exclude (optional)"
echo "[ -o outprefix ] prefix of output files (optional, default: Y_STRs)"
echo "[ -m max_motif ] maximum motif length (default: 6)"
echo "[ -M max_len ] maximum STR length (default: 100)"
echo "[ -n no_geno ] skip genotyping with HipSTR"
exit 2
}
exit_abnormal() {
usage
exit 1
}
# check if dependencies exists
# https://raymii.org/s/snippets/Bash_Bits_Check_if_command_is_available.html
command_exists() {
# check if command exists and fail otherwise
command -v "$1" >/dev/null 2>&1
if [[ $? -ne 0 ]]; then
echo "$1 is required but it's not installed. Abort."
exit 1
fi
}
while getopts ":r:s:t:e:o:m:M:n" options; do
case "${options}" in
r)
Y_ref=${OPTARG}
;;
s)
samples=${OPTARG}
;;
t)
HipSTR=${OPTARG}
;;
e)
exclude=${OPTARG}
;;
o)
outprefix=${OPTARG}
;;
m)
max_motif=${OPTARG}
;;
M)
max_len=${OPTARG}
;;
n)
skip=true
;;
:)
echo "Error: -${OPTARG} requires an argument."
exit_abnormal
;;
*)
exit_abnormal
;;
esac
done
# check if trf installed
for COMMAND in "trf"; do
command_exists "${COMMAND}"
done
# if exclude is active check if bedtools is installed
if [ ! $exclude ]; then
command_exists "bedtools"
fi
# fasta is required and must exists
if [ -z $Y_ref ]; then echo "Y reference not given"; usage; exit 1; fi
if [ ! -f $Y_ref ]; then echo "Y reference file $Y_ref not found"; exit 1; fi
# can hipstr be found?
if [[ "$skip" == false ]]; then
# samples is required and must exists
if [ -z $samples ]; then echo "samples file not given"; usage; exit 1; fi
if [ ! -f $samples ]; then echo "samples file $samples not found"; exit 1; fi
command_exists $HipSTR
fi
# if prexif not given default to Y_STRss
if [ ! $outprefix ]; then
outprefix="Y_STRs"
fi
# exit if max motif or max len are not integers
if ! [[ "$max_motif" =~ ^[0-9]+$ ]];
then echo "error: -m $max_motif is not an integer" >&2; exit 1
fi
if ! [[ "$max_len" =~ ^[0-9]+$ ]];
then echo "error: -M $max_len is not an integer" >&2; exit 1
fi
echo "Looking for STR across $Y_ref on $samples with trf" 1>&2
echo "filtering for motifs shorter than $max_motif and regions shorter then $max_len" 1>&2
# take fasta id to produce regions file
Y_id=$(head -1 $Y_ref | cut -f1 -d' ' | sed 's/>//')
Y_name=$(basename $Y_ref)
Y_name="${Y_name%.*}"
# run trf with reccomended parameters and max motif
trf $Y_ref 2 5 7 80 10 80 $max_motif -h -d
# name of regions file
filtered_bed=${Y_name}_${max_motif}_${max_len}_filtered_STR.bed
# Filter regions, if exclude is given remove STR overlapping excluded regions
if [ ! $exclude ]; then
sed '1,15d' $(basename ${Y_ref}).2.5.7.80.10.80.${max_motif}.dat | \
awk -v Y_id="$Y_id" -v max_motif="$max_motif" -v max_len="$max_len" \
'BEGIN{OFS="\t"} {if ($3 > 2 && $3 <= max_motif && $2-$1 <= max_len) \
{print Y_id, $1, $2, $3, $4}}' | \
awk '!seen[$3]++' > $filtered_bed
else
sed '1,15d' $(basename ${Y_ref}).2.5.7.80.10.80.${max_motif}.dat | \
awk -v Y_id="$Y_id" -v max_motif="$max_motif" -v max_len="$max_len" \
'BEGIN{OFS="\t"} {if ($3 > 2 && $3 <= max_motif && $2-$1 <= max_len) \
{print Y_id, $1, $2, $3, $4}}' | \
awk '!seen[$3]++' |\
bedtools intersect -v -a stdin -b $exclude > $filtered_bed
fi
# how many STRs?
STRs_number=$(wc -l < $filtered_bed)
echo -e "\n$STRs_number STRs were found after filtering" 1>&2
if [[ "$skip" == true ]]; then
echo "Skipping genotyping"
exit 0
fi
echo -e "\nRunning HipSTR to genotype the samples"
# relaxed filter of min reads per str
nsamp=$(wc -l < $samples)
# if the str is exactly as max_len hipstr will ignore it therefore add 1
max_len_h=$((max_len + 1))
# run hipstr
$HipSTR --bam-files $samples --fasta $Y_ref --regions $filtered_bed --haploid-chrs $Y_id \
--str-vcf ${outprefix}.vcf.gz --log ${outprefix}.log --lib-from-samp --output-filters \
--max-flank-indel 1 --def-stutter-model --min-reads $nsamp --max-str-len $max_len_h
# print some useful stats on hipstr log
no_reads=$(grep -c "Skipping locus with too few reads" ${outprefix}.log)
too_rep=$(grep -c "too repetitive" ${outprefix}.log)
grep "succeeded" ${outprefix}.log
echo "$no_reads had too few reads (less than the number of samples: $nsamp), in $too_rep the sequence upstream of the repeat is too repetitive for accurate genotyping"
exit 0