forked from eead-csic-compbio/get_homologues
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot_matrix_heatmap.sh
executable file
·318 lines (262 loc) · 10.4 KB
/
plot_matrix_heatmap.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/usr/bin/env bash
# 2015-6 Pablo Vinuesa (1) and Bruno Contreras-Moreira (2):
# 1: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)
# 2: http://www.eead.csic.es/compbio (Laboratory of Computational Biology, EEAD/CSIC, Spain)
#: AIM: Prints ordered matrix heatmap (without computing distance)
# based on a tab-delimited input file with header and rownames
#: OUTPUT: svg and pdf; png not implemented yet
progname=${0##*/} # plot_matrix_heatmap.sh
VERSION='v0.3_13Apr16' # added option -c to filter input matrix by a maximum similarity cut-off value
# to reduce excessive redundancy. Improved the help text printed with -M
#'0.2_26Feb15' # wrote R function sim2dist() to compute bioNJ tree with ape
# based on ANI sim-matrix, and write it to file as newick string
#'0.1_16Feb15'; first version
date_F=$(date +%F |sed 's/-/_/g')-
date_T=$(date +%T |sed 's/:/./g')
start_time="$date_F$date_T"
#---------------------------------------------------------------------------------#
#>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTION DEFINITIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#
#---------------------------------------------------------------------------------#
function print_help()
{
cat << HELP
USAGE synopsis for: [$progname v.$VERSION]:
$progname <string (tab-delimited matrix file)> [-t|-m|-v|-o|-p|-H|-W|-C|-M]
REQUIRED
-i <string> presence_absence tab file
OPTIONAL:
I) filter out excessive redundancy in the tab-delimited ANI matrix file
-c <float> (maximum) identity cut-off value (e.g. 97.3) [def: $sim_cutoff]
II) tweak the graphical output:
-t <string> text for plot title [def input_tab_file_name]
-m <integer> margins_horizontal [def $margin_hor]
-v <integer> margins_vertical [def $margin_vert]
-o <string> output file format [def $outformat]
-p <integer> points for plotting device [def $points]
-H <integer> output device height [def $height]
-W <integer> output device width [def $width]
-C <flag> do not reorder clusters [def reorder clusters and plot dendrogram]
RUN NJ ANALYSIS using ANI matrix (average identity matrix) generated by get_homologues.pl -A -t 0 -M|G
-N <flag> will compute a distance matrix from the input similarity matrix
and use the former to construct a NJ tree and save it as newick string
Do not use with a binary presence-absence matrix
MORE DETAILED HELP:
-M <flag> prints gplot installation instructions and further usage information
EXAMPLE:
$progname -i Avg_identity.tab -c 98.5 -t "Genus X ANIb (OMCL all clusters)" -N -o pdf -m 22 -v 22 -p 20 -H 20 -W 30
#------------------------------------------------------------------------------------------------------------------
AIM: Plot ordered heatmaps with row and col. dendrogram, from squared numeric (distance or presence-absence) matrix,
like the *Avg_identity.tab matrices produced by get_homologues.pl with the -A flag
OUTPUT: svg|pdf files
DEPENDENCIES:
R packages ape and gplots. Run $progname -M for installation instructions.
NOTES:
1. To improve the display, shorten the genome/taxon names as much as possible
2. Use -m and -v to control horizontal and vertial margins to the plot
to fit the taxon labels
#------------------------------------------------------------------------------------------------------------------
HELP
check_dependencies
exit 0
}
#--------------------------------------------------------------------------------------
function print_man()
{
cat << MAN
$progname is a simple shell wrapper around heatmap.2() from the gplots pacakge.
Generates ordered heatmaps with row and column dendrograms from
an input distace/dissimilarity matrix, or a binary presence-abasence matrix.
1) If the package is not installed on your system, then proceed as follows:
i) with root privileges, type the following into your shell console:
sudo R
> install.packages(c("gplots", "ape"), dependencies=TRUE)
> q()
$ exit # exit from the root account
$ R # call R
> library("gplots") # load the lib; do your stuff
> library("ape") # load the lib; do your stuff
ii) without root privileges, intall the package into ~/lib/R as follows:
$ mkdir -p ~/lib/R
# set the R_LIBS environment variable before starting R as follows:
$ export R_LIBS=~/lib/R # bash syntax
$ setenv R_LIBS=~/lib/R # csh syntax
# You can type the corresponding line into your .bashrc (or similar) configuration file
# to make this options persistent
# Call R from your terminal and type:
> install.packages("gplots", dependencies=TRUE, lib="~/lib/R")
2) Once installed, you can read the documentation for packages and functions by typing the following into the R console:
library("gplots") # loads the lib into the environment
library("ape")
help(package="gplots") # read about the gplots package
help(heatmap.2) # read about the heatmap.2 function
help(svg) # read about the svg function, which generates the svg ouput file
help(pdf) # read about the pdf function, which generates the pdf ouput file
...
MAN
exit 0
}
#--------------------------------------------------------------------------------------
function check_dependencies()
{
for prog in R
do
bin=$(type -P $prog)
if [ -z $bin ]; then
echo
echo "# ERROR: $prog not in place!"
echo "# ... you will need to install \"$prog\" first or include it in \$PATH"
echo "# ... exiting"
exit 1
fi
done
echo
echo '# Run check_dependencies() ... looks good: R is installed.'
echo
}
#---------------------------------------------------------------------------------------#
#----------------------------------- GET OPTIONS ---------------------------------------#
#---------------------------------------------------------------------------------------#
tab_file=
check_dep=0
sim_cutoff=100
text=
width=15
height=10
points=15
margin_hor=18
margin_vert=18
outformat=svg
do_nj=0
reorder_clusters=1
# See bash cookbook 13.1 and 13.2
while getopts ':c:i:t:m:o:p:v:H:W:hMNC?:' OPTIONS
do
case $OPTIONS in
c) sim_cutoff=$OPTARG
;;
i) tab_file=$OPTARG
;;
m) margin_hor=$OPTARG
;;
v) margin_vert=$OPTARG
;;
o) outformat=$OPTARG
;;
p) points=$OPTARG
;;
t) text=$OPTARG
;;
H) height=$OPTARG
;;
W) width=$OPTARG
;;
M) print_man && exit 0
;;
N) do_nj=1
;;
C) reorder_clusters=0
;;
\:) printf "argument missing from -%s option\n" $OPTARG
print_help
exit 2
;;
\?) echo "need the following args: "
print_help
exit 3
;;
*) echo "An unexpected parsing error occurred"
echo
print_help
exit 4
;;
esac >&2 # print the ERROR MESSAGES to STDERR
done
shift $(($OPTIND - 1))
if [ -z $tab_file ]
then
echo "# ERROR: no input tab file defined!"
print_help
exit 1
fi
if [ -z $text ]
then
text=$(echo $tab_file)
fi
#-------------------#
#>>>>>> MAIN <<<<<<<#
#-------------------#
# 0) print run's parameter setup
wkdir=$(pwd)
cat << PARAMS
##############################################################################################
>>> $progname v$VERSION run started at $start_time
working directory = $wkdir
input tab_file = $tab_file | sim_cutoff = $sim_cutoff
text=$text|margin_hor=$margin_hor|margin_vert=$margin_vert|points=$points
width=$width|height=$height|outformat=$outformat
reorder_clusters=$reorder_clusters|do_bioNJ=$do_nj
##############################################################################################
PARAMS
# 1) prepare R's output file names
sim_cutoff_int=$(echo $sim_cutoff | cut -d\. -f1)
if [ $sim_cutoff_int -ne 100 ]
then
heatmap_outfile="${tab_file%.*}_sim_cutoff_${sim_cutoff}_heatmap.$outformat"
echo "# Plotting file $heatmap_outfile"
nj_tree="${tab_file%.*}_sim_cutoff_${sim_cutoff}_BioNJ.ph"
else
heatmap_outfile="${tab_file%.*}_heatmap.$outformat"
echo "# Plotting file $heatmap_outfile"
nj_tree="${tab_file%.*}_BioNJ.ph"
fi
# 2) call R using a heredoc and write the resulting script to file
R --no-save -q <<RCMD > ${progname%.*}_script_run_at_${start_time}.R
library("gplots")
library("ape")
options(expressions = 100000) #https://stat.ethz.ch/pipermail/r-help/2004-January/044109.html
tab <- read.table(file="$tab_file", header=TRUE)
mat_dat <- data.matrix(tab[,2:ncol(tab)])
rnames <- tab[,1]
rownames(mat_dat) <- rnames
if($sim_cutoff < 100)
{
tmp_mat = mat_dat
diag(tmp_mat) = NA
rows <- (!apply( tmp_mat , 1 , function(x) any( x > $sim_cutoff , na.rm=T) ) )
mat_dat <- mat_dat[rows, rows]
}
if($reorder_clusters > 0){
$outformat("$heatmap_outfile", width=$width, height=$height, pointsize=$pointsize)
heatmap.2(mat_dat, cellnote=mat_dat, main="$text", notecol="black", density.info="none", trace="none", margins=c($margin_vert,$margin_hor), lhei = c(1,5))
dev.off()
} else {
$outformat("$heatmap_outfile", width=$width, height=$height, pointsize=$pointsize)
heatmap.2(mat_dat, cellnote=mat_dat, main="$text", notecol="black", density.info="none", trace="none", margins=c($margin_vert,$margin_hor), lhei = c(1,5), dendrogram = "row", Colv = FALSE)
dev.off()
}
if($do_nj > 0){
sim2dist <- function(x) 100 - x
bionj <- bionj(as.dist(apply(mat_dat, 1, sim2dist)))
write.tree(phy=bionj, file="$nj_tree")
}
RCMD
if [ -s $heatmap_outfile ]
then
echo ">>> file $heatmap_outfile was produced"
echo
else
echo ">>> ERROR: file $heatmap_outfile was NOT produced."
echo
echo ">>> You can try option -C or alternatively remove columns in the matrix."
echo
fi
if [ $do_nj -eq 1 ] && [ -s $nj_tree ]
then
echo ">>> file $nj_tree was produced"
echo
fi
if [ $do_nj -eq 1 ] && [ ! -s $nj_tree ]
then
echo ">>> ERROR: file $nj_tree was NOT produced!"
echo
fi