-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathskandiver.sh
95 lines (77 loc) · 2.95 KB
/
skandiver.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/bin/bash
#Function ensuring files/directories are present
check_directory() {
if [ ! -d "$1" ]; then
echo "Error: $2 not found or incorrectly formatted; please double-check if file initialized correctly."
echo "Exiting skandiver."
exit 1
fi
}
#Function to extract fasta/fna files from compressed files
gunzip_files() {
echo "Step 1: Gunzipping files..."
for file in "$1"/*.gz; do
gunzip "$file"
done
}
#Function to fragment genomes according to a set size
fragment_genomes() {
echo "Step 2: Fragmenting genomes..."
echo "python3 chunktwentytwo.py $1 $2 $3"
python3 chunktwentytwo.py "$1" "$2" "$3"
}
#Function to search fragmented genomes against representative genome database
skani_search() {
echo "Step 3: skani searching against representative genomes..."
echo "skani search --qi $1 -d $2 -o $3 -t 10"
skani search --qi "$1" -d "$2" -o "$3" -t 10
}
#Extract search results with a high match
filter_skani_results() {
echo "Step 4: Filtering skani search result..."
echo "awk 'NR==1 || (\$3 > 95 && \$5 > 90)' $1 > $2"
awk 'NR==1 || ($3 > 95 && $5 > 90)' "$1" > "$2"
#Check if filtered file can be annotated
if [ $(wc -l < "$2") -eq 1 ]; then
echo "No results to annotate in $2. Exiting..."
exit 1
fi
}
#Annotate search results with relevant evolutionary divergence information
annotate_divergence() {
echo "Step 5: Annotating evolutionary divergence information..."
echo "python3 nuudivergefour.py $1 $2"
python3 nuudivergefour.py "$1" "$2.txt"
}
main() {
if [ "$#" -ne 4 ]; then
echo "Usage: $0 INPUT_DIRECTORY OUTPUT_NAME CHUNKSIZE REFERENCE_DATABASE"
exit 1
fi
INPUT_DIRECTORY=$1
OUTPUT_NAME=$2
CHUNKSIZE=$3
REFERENCE_DATABASE=$4
OUTPUT_NAME_SEARCH="${OUTPUT_NAME}search.fna"
OUTPUT_NAME_SKANI="${OUTPUT_NAME}skani.txt"
OUTPUT_NAME_SKANI_FILTERED="${OUTPUT_NAME}skanifiltered.txt"
check_directory "$INPUT_DIRECTORY" "Input directory"
check_directory "$REFERENCE_DATABASE" "Reference database"
echo ========================================
gunzip_files "$INPUT_DIRECTORY"
echo ========================================
fragment_genomes "$INPUT_DIRECTORY" "$OUTPUT_NAME_SEARCH" "$CHUNKSIZE"
echo ========================================
skani_search "$OUTPUT_NAME_SEARCH" "$REFERENCE_DATABASE" "$OUTPUT_NAME_SKANI"
echo ========================================
filter_skani_results "$OUTPUT_NAME_SKANI" "$OUTPUT_NAME_SKANI_FILTERED"
echo ========================================
annotate_divergence "$OUTPUT_NAME_SKANI_FILTERED" "$OUTPUT_NAME"
echo ========================================
if [ -s "${OUTPUT_NAME}.txt" ]; then
echo "skandiver completed successfully. Potential mobile element regions written to ${OUTPUT_NAME}.txt"
else
echo "Error: skandiver was unable to find any mobile regions of interest in $INPUT_DIRECTORY"
fi
}
main "$@"