-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun-program.sh
executable file
·150 lines (107 loc) · 4.76 KB
/
run-program.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/bin/bash
echo "Starting..."
# paths to required programs, edit if necessary.
pathTo_makeblastdb=makeblastdb;
pathTo_makembindex=makembindex;
pathTo_blastn=blastn;
pathTo_python=python;
# check if required program paths are okay to use.
command -v $pathTo_makeblastdb >/dev/null 2>&1 || { echo "I require blast, please provide the right path. Aborting." >&2; exit 1; }
command -v $pathTo_makembindex >/dev/null 2>&1 || { echo "I require blast, please provide the right path. Aborting." >&2; exit 1; }
command -v $pathTo_blastn >/dev/null 2>&1 || { echo "I require blastn, please provide the right path. Aborting." >&2; exit 1; }
command -v $pathTo_python >/dev/null 2>&1 || { echo "I require python, please provide the right path. Aborting." >&2; exit 1; }
echo "Required program path are okay to use"
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
SRC=$DIR/src
echo "Checking provided input"
EXTANT_GENOMES="";
CONTIGS="";
GRAPH="";
TREE="";
WEIGHT="";
#read parameters from input and check if all parameters are provided
if [ $# -ne 10 ]
then
echo "Please provide the expected parameter!"
echo "usage: $0 -fasta <extant_genomes.fa> -contigs <contigs> -dot <assembly-graph> -tree <nwk-file> -weight <weight>"
exit 1
fi
while [ $# -gt 0 ]
do
case "$1" in
-fasta) EXTANT_GENOMES="$2"; shift;;
-contigs) CONTIGS="$2"; shift;;
-dot) GRAPH="$2"; shift;;
-tree) TREE="$2"; shift;;
-weight) WEIGHT="$2"; shift;;
--) shift; break;;
-*)
echo >&2 \
"usage: $0 -fasta <extant_genomes.fa> -contigs <contigs> -dot <assembly-graph> -tree <nwk-file> -weight <weight>"
exit 1;;
*) break;; # terminate while loop
esac
shift
done
echo "creating file structure"
INTERIM=./interim
RESULTS=./results
EXTANT_GENOMES_DB=$INTERIM/extant_genomes_db
MEGABLAST_HITS=$INTERIM/megablast_hits
FPSAC=/home/nina/software/FPSAC/src
ANCIENT_CONTIGS_LENGTH=$INTERIM/ancient_contigs_length
ANCIENT_EXTANT_HITS=$INTERIM/ancient_extant_hits
FAMILIES_WITH_CONTIG_NAMES=$INTERIM/families_with_contig_names
FAMILIES_PROFILES=$INTERIM/families_profiles
FAMILIES_WITH_CONTIG_NAMES_CORRECTED=$INTERIM/families_with_contig_names_corrected
FAMILIES_PROFILES_CORRECTED=$INTERIM/families_profiles_corrected
FAMILIES_ERRORS=$INTERIM/families_errors
FILTERED_CONTIGS=$INTERIM/filtered_contigs
FILTERED_CONTIGS_IDS=$INTERIM/filtered_contigs_ids
PRUNED_GRAPH=$INTERIM/pruned_graph.dot
ALL_ADJACENCIES=$INTERIM/all_adjacencies
OUTPUT_ADJACENCIES=$RESULTS/adjacencies/
OUTPUT_SCAFFOLDS=$RESULTS/scaffolds/
FILTERED_FAMILIES=$INTERIM/filtered_families
mkdir -p $INTERIM
mkdir -p $RESULTS
mkdir -p $OUTPUT_ADJACENCIES
mkdir -p $OUTPUT_SCAFFOLDS
echo "Filtering assembly contigs"
#filter contigs longer than 500bp
python $SRC/biggestContigs.py $CONTIGS $FILTERED_CONTIGS 500 $ANCIENT_CONTIGS_LENGTH
#store IDs of all filtered contigs
grep ">" $FILTERED_CONTIGS | cut -d " " -f 1 | cut -c 2- > $FILTERED_CONTIGS_IDS
#get all contig ids from original file
biggestContigID=`tail -n 2 $CONTIGS | grep ">" | awk '{ print $1 }' | cut -c 2-`
echo "Prune assembly graph to only include biggest contigs"
#prune assembly graph so that it only contains filtered contigs
python $SRC/pruneAssemblyGraph.py $GRAPH $FILTERED_CONTIGS_IDS $biggestContigID $PRUNED_GRAPH
echo "Preprocessing reads to find marker sequence"
echo "Index extant genomes"
#indexing extant genomes fasta
makembindex -input $EXTANT_GENOMES -old_style_index true -output $EXTANT_GENOMES_DB -verbosity quiet
echo "Create database from extant genomes"
#creating database from extant genomes
makeblastdb -in $EXTANT_GENOMES -dbtype nucl
echo "blast contigs"
#blast contigs from ancient genome (assembly graph) against extant genomes
blastn -task megablast -db $EXTANT_GENOMES -query $FILTERED_CONTIGS -use_index True -index_name $EXTANT_GENOMES_DB -out $MEGABLAST_HITS -outfmt 6
echo "format blast hits"
#format blast hits
python $FPSAC/fpsac_format_blast_hits.py \
${MEGABLAST_HITS} \
${ANCIENT_CONTIGS_LENGTH} \
${ANCIENT_EXTANT_HITS}
echo "find families"
#find families
python $FPSAC/fpsac_compute_families_coordinates_and_profiles.py $ANCIENT_EXTANT_HITS $FAMILIES_WITH_CONTIG_NAMES $FAMILIES_PROFILES 100 95
echo "error correction step"
#error correction
python $FPSAC/fpsac_correct_families.py $FAMILIES_WITH_CONTIG_NAMES $FAMILIES_PROFILES $FAMILIES_WITH_CONTIG_NAMES_CORRECTED $FAMILIES_PROFILES_CORRECTED $FAMILIES_ERRORS
python $SRC/correctFamilies.py $FAMILIES_WITH_CONTIG_NAMES_CORRECTED $FILTERED_FAMILIES $TREE
echo "Compute adjacency set for internal nodes of the phylogeny"
#extract adjacencies for extant genomes and assembly graph
python $SRC/extractAdjacencies.py $FILTERED_FAMILIES $PRUNED_GRAPH $TREE $ALL_ADJACENCIES
#compute Fitch on extracted adjacencies
python $SRC/Main.py $TREE $ALL_ADJACENCIES $OUTPUT_ADJACENCIES $OUTPUT_SCAFFOLDS $WEIGHT