Skip to content

Commit

Permalink
updates to v0.1.8
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer authored and Jon Palmer committed Apr 27, 2016
1 parent 9854bc6 commit 6e454e6
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 171 deletions.
230 changes: 146 additions & 84 deletions bin/funannotate-compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def __init__(self,prog):
lib.copyDirectory(os.path.join(parentdir, 'html_template', 'css'), os.path.join(args.out, 'css'))
if not os.path.isdir(os.path.join(args.out, 'js')):
lib.copyDirectory(os.path.join(parentdir, 'html_template', 'js'), os.path.join(args.out, 'js'))

#loop through each genome
stats = []
merops = []
Expand All @@ -106,6 +106,7 @@ def __init__(self,prog):
eggnog = []
busco = []
gbkfilenames = []
scinames = []
num_input = len(args.input)
if num_input == 0:
lib.log.error("Error, you did not specify an input, -i")
Expand Down Expand Up @@ -142,22 +143,24 @@ def __init__(self,prog):
lib.parseGOterms(GBK, go_folder, stats[i][0].replace(' ', '_'))
lib.gb2proteinortho(GBK, protortho, stats[i][0].replace(' ', '_'))
eggnog.append(lib.getEggNogfromNote(GBK))

#convert eggnog to a single dictionary for lookup later
EGGNOG = { k: v for d in eggnog for k, v in d.items() }
scinames.append(stats[i][0].replace(' ', '_'))

#convert busco to dictionary
busco = lib.dictFlip(busco)
busco = lib.busco_dictFlip(busco)

#add species names to pandas table
names = []
for i in stats:
sci_name = i[0]
genus = sci_name.split(' ')[0]
species = ' '.join(sci_name.split(' ')[1:])
abbrev = genus[:1] + '.'
final_name = abbrev + ' ' + species
names.append(final_name)
if '_' in sci_name: #here I'm assuming that somebody used an abbreviated name and an underscore, this would be atypical I think
names.append(sci_name)
else:
genus = sci_name.split(' ')[0]
species = ' '.join(sci_name.split(' ')[1:])
abbrev = genus[:1] + '.'
final_name = abbrev + ' ' + species
names.append(final_name)


#PFAM#############################################
lib.log.info("Summarizing PFAM domain results")
Expand All @@ -170,8 +173,11 @@ def __init__(self,prog):
pfamdf['species'] = names
pfamdf.set_index('species', inplace=True)

#remove any "empty" genomes
pfamdf = pfamdf[(pfamdf.T != 0).any()]

#make an nmds
if len(args.input) > 1:
if len(pfamdf.index) > 1: #make sure number of species is at least two
lib.distance2mds(pfamdf, 'braycurtis', 'PFAM', os.path.join(args.out, 'pfam','PFAM.nmds.pdf'))

#get the PFAM descriptions
Expand All @@ -195,7 +201,7 @@ def __init__(self,prog):
output.write(lib.FOOTER)

##################################################

####InterProScan##################################
lib.log.info("Summarizing InterProScan results")
if not os.path.isdir(os.path.join(args.out, 'interpro')):
Expand All @@ -207,14 +213,21 @@ def __init__(self,prog):
IPRdf['species'] = names
IPRdf.set_index('species', inplace=True)

#some checking here of data, if genome is missing, i.e. counts are zero, drop it
#print IPRdf
#print len(IPRdf.columns)
IPRdf = IPRdf[(IPRdf.T != 0).any()]
#print len(IPRdf.index)

#analysis of InterPro Domains
#get IPR descriptions
lib.log.info("Loading InterPro descriptions")
INTERPRO = lib.iprxml2dict(os.path.join(parentdir, 'DB', 'interpro.xml'))
#NMDS
if len(args.input) > 1:
if len(IPRdf.columns) > 1:
if len(IPRdf.index) > 1: #count number of species
if len(IPRdf.columns) > 1: #count number of IPR domains
lib.distance2mds(IPRdf, 'braycurtis', 'InterProScan', os.path.join(args.out, 'interpro', 'InterProScan.nmds.pdf'))

#write to csv file
ipr2 = IPRdf.transpose()
ipr_desc = []
Expand All @@ -232,12 +245,12 @@ def __init__(self,prog):
output.write(lib.HEADER)
output.write(lib.INTERPRO)
if len(IPRdf.columns) > 1:
output.write(ipr2.to_html(index=False, escape=False, classes='table table-hover'))
if len(IPRdf.index) > 1:
output.write(ipr2.to_html(index=False, escape=False, classes='table table-hover'))
output.write(lib.FOOTER)

##############################################


####MEROPS################################
lib.log.info("Summarizing MEROPS protease results")
if not os.path.isdir(os.path.join(args.out, 'merops')):
Expand Down Expand Up @@ -388,7 +401,6 @@ def __init__(self,prog):
output.write(lib.FOOTER)
########################################################


####GO Terms, GO enrichment############################
if not os.path.isdir(os.path.join(args.out, 'go_enrichment')):
os.makedirs(os.path.join(args.out, 'go_enrichment'))
Expand Down Expand Up @@ -444,7 +456,7 @@ def __init__(self,prog):
else:
output.write('<table border="1" class="dataframe table table-hover">\n<th>No enrichment found</th></table>')
output.write(lib.FOOTER)

####################################################

##ProteinOrtho################################
Expand All @@ -455,78 +467,126 @@ def __init__(self,prog):
lib.log.info("Running orthologous clustering tool, ProteinOrtho5. This may take awhile...")
#setup protein ortho inputs, some are a bit strange in the sense that they use equals signs
log = os.path.join(protortho, 'proteinortho.log')
#get list of files in folder

#generate list of files based on input order for consistency
filelist = []
for file in os.listdir(protortho):
if file.endswith('.faa'):
filelist.append(file)
for i in stats:
name = i[0].replace(' ', '_')
name = name+'.faa'
filelist.append(name)
fileinput = ' '.join(filelist)
#print fileinput
cmd = ['proteinortho5.pl', '-project=funannotate', '-synteny', '-cpus='+str(args.cpus), '-singles', '-selfblast']
cmd2 = cmd + filelist
if not os.path.isfile(os.path.join(args.out, 'protortho', 'funannotate.poff')):
with open(log, 'w') as logfile:
subprocess.call(cmd2, cwd = protortho, stderr = logfile, stdout = logfile)

#now process the output, get # of singletons per genome, total orthologs, single-copy orthologs and append to stats, output text file with groups
#open poff in pandas to parse "easier" for stats, orthologs, etc
df = pd.read_csv(os.path.join(args.out, 'protortho', 'funannotate.poff'), sep='\t', header=0)
df.rename(columns=lambda x: x.replace('.faa', ''), inplace=True)
#reorder table to it matches up with busco list of dicts
newhead = [df.columns.values[0], df.columns.values[1], df.columns.values[2]]
newhead += scinames
df = df[newhead]
#write to file (not sure I need this now?)
#df.to_csv(os.path.join(args.out, 'protortho', 'funannotate_reorder.poff'), sep='\t', index=False)
#now filter table to only single copy orthologs to use with phylogeny
num_species = len(df.columns) - 3
sco = df[(df['# Species'] == num_species) & (df['Genes'] == num_species)]
sco_hits = sco.drop(sco.columns[0:3], axis=1)
#now cross reference with busco, as we want this for phylogeny
keep = []
sc_buscos = []
for index, row in sco_hits.iterrows():
busco_check = []
for i in range(0, num_species):
if row[i] in busco[i]:
busco_check.append(busco[i].get(row[i]))
busco_check = lib.flatten(busco_check)
#need to check if outgroup is passed and this model exists in that outgroup
if len(set(busco_check)) == 1:
if args.outgroup:
available_busco = []
with open(outgroup_species, 'rU') as outfasta:
for line in outfasta:
if line.startswith('>'):
line = line.replace('\n', '')
name = line.replace('>', '')
available_busco.append(name)
if busco_check[0] in available_busco:
keep.append(index)
sc_buscos.append(busco_check[0])
else:
keep.append(index)
sco_final = sco_hits.ix[keep]

#take dataframe and output the ortholog table.
dftrim = df.drop(df.columns[0:3], axis=1) #trim down to just gene models
orthdf = df[(df['# Species'] > 1)] #get rid of singletons in this dataset
orth_hits = orthdf.drop(orthdf.columns[0:3], axis=1) #trim to just gene models

orthologs = os.path.join(args.out, 'annotations','orthology_groups.txt')
with open(orthologs, 'w') as output:
with open(os.path.join(args.out, 'protortho', 'funannotate.poff'), 'rU') as input:
count = 0
scoCount = 0
for line in input:
line = line.replace('\n', '') #strip line ending
if line.startswith('#'):
header = line
species = header.split('\t')[3:]
num_species = header.count('\t') - 2
continue
col = re.split(r'[,\t]', line)
if col[0] != '1':
count +=1
ID = 'orth'+str(count)
prots = col[3:]
prots = [x for x in prots if x != '*']
eggs = []
buscos = []
for i in prots:
hit = EGGNOG.get(i)
if not hit in eggs:
eggs.append(hit)
hit2 = busco.get(i)
if not hit2 in buscos:
buscos.append(hit2)
eggs = [x for x in eggs if x is not None]
buscos = [x for x in buscos if x is not None]
buscos = lib.flatten(buscos)
if len(eggs) > 0:
eggs = ', '.join(str(v) for v in eggs)
else:
eggs = 'None'
if len(buscos) > 0:
buscos = set(buscos)
buscos = ', '.join(str(v) for v in buscos)
else:
buscos = 'None'
if col[0] == str(num_species) and col[1] == str(num_species):
scoCount += 1
output.write("%s\t%s\t%s\t%s\n" % (ID, eggs, buscos, ', '.join(prots)))
#should be able to parse the pandas ortho dataframe now
for index, row in orth_hits.iterrows():
ID = 'orth'+str(index)
buscos = []
eggs = []
proteins = []
for x in range(0, len(row)):
if row[x] != '*':
prots = row[x].split(',')
for y in prots:
proteins.append(y)
egghit = eggnog[x].get(y)
if not egghit in eggs:
eggs.append(egghit)
buscohit = busco[x].get(y)
if not buscohit in buscos:
buscos.append(buscohit)
#clean up the None's that get added
eggs = [x for x in eggs if x is not None]
buscos = [x for x in buscos if x is not None]
buscos = lib.flatten(buscos)

#write to output
if len(eggs) > 0:
eggs = ', '.join(str(v) for v in eggs)
else:
eggs = 'None'
if len(buscos) > 0:
buscos = set(buscos)
buscos = ', '.join(str(v) for v in buscos)
else:
buscos = 'None'
output.write("%s\t%s\t%s\t%s\n" % (ID, eggs, buscos, ', '.join(proteins)))

if not os.path.isdir(os.path.join(args.out, 'stats')):
os.makedirs(os.path.join(args.out, 'stats'))
summary = []
for i in stats:
try:
singles = lib.singletons(os.path.join(args.out, 'protortho', 'funannotate.poff'), i[0])
except IOError:
singles = 0
i.append("{0:,}".format(singles))
try:
orthos = lib.orthologs(os.path.join(args.out, 'protortho', 'funannotate.poff'), i[0])
except IOError:
orthos = 0
i.append("{0:,}".format(orthos))
i.append("{0:,}".format(scoCount))
summary.append(i)
#get stats, this is all single copy orthologs
scoCount = len(sco_hits)
for i in range(0, len(stats)):
orthos = 0
for index, row in orth_hits[scinames[i]].iteritems():
if row != '*':
add = row.count(',') + 1
orthos += add
singletons = 0
for index, row in dftrim.iterrows():
if row[scinames[i]] != '*':
others = []
for y in range(0, len(row)):
others.append(row[y])
others = set(others)
if len(others) == 2:
singletons += 1
stats[i].append("{0:,}".format(singletons))
stats[i].append("{0:,}".format(orthos))
stats[i].append("{0:,}".format(scoCount))
summary.append(stats[i])

#convert to dataframe for easy output
header = ['species', 'isolate', 'Assembly Size', 'Largest Scaffold', 'Average Scaffold', 'Num Scaffolds', 'Scaffold N50', 'Percent GC', 'Num Genes', 'Num Proteins', 'Num tRNA', 'Unique Proteins', 'Prots atleast 1 ortholog', 'Single-copy orthologs']
df = pd.DataFrame(summary, columns=header)
Expand All @@ -539,6 +599,7 @@ def __init__(self,prog):
output.write(df.transpose().to_html(classes='table table-condensed'))
output.write(lib.FOOTER)
############################################

######summarize all annotation for each gene in a table
lib.log.info("Compiling all annotations for each genome")

Expand Down Expand Up @@ -574,14 +635,13 @@ def __init__(self,prog):
meropsDict = lib.dictFlip(merops)
cazyDict = lib.dictFlip(cazy)


table = []
header = ['GeneID','length','description', 'Ortho Group', 'EggNog', 'BUSCO','Protease family', 'CAZyme family', 'InterPro Domains', 'PFAM Domains', 'GO terms', 'SecMet Cluster', 'SMCOG']
for i in range(0,num_input):
outputname = os.path.join(args.out, 'annotations', stats[i][0].replace(' ', '_')+'.all.annotations.tsv')
for y in range(0,num_input):
outputname = os.path.join(args.out, 'annotations', scinames[y]+'.all.annotations.tsv')
with open(outputname, 'w') as output:
output.write("%s\n" % ('\t'.join(header)))
with open(gbkfilenames[i], 'rU') as input:
with open(gbkfilenames[y], 'rU') as input:
SeqRecords = SeqIO.parse(input, 'genbank')
for record in SeqRecords:
for f in record.features:
Expand All @@ -608,8 +668,8 @@ def __init__(self,prog):
cazydomains = "; ".join(cazyDict.get(ID))
else:
cazydomains = ''
if ID in busco:
buscogroup = busco.get(ID)[0]
if ID in busco[y]:
buscogroup = busco[y].get(ID)[0]
else:
buscogroup = ''
if ID in goDict:
Expand All @@ -635,6 +695,7 @@ def __init__(self,prog):
final_result = [ID, str(length), description, orthogroup, egg, buscogroup, meropsdomains, cazydomains, IPRdomains, pfamdomains, goTerms, cluster, smcog]
output.write("%s\n" % ('\t'.join(final_result)))
############################################

#build phylogeny
if not os.path.isfile(os.path.join(args.out, 'phylogeny', 'RAxML.phylogeny.pdf')):
if outgroup:
Expand All @@ -643,7 +704,8 @@ def __init__(self,prog):
num_phylogeny = len(args.input)
if num_phylogeny > 3:
lib.log.info("Inferring phylogeny using RAxML")
lib.ortho2phylogeny(os.path.join(args.out, 'protortho', 'funannotate.poff'), args.num_orthos, busco, args.cpus, args.bootstrap, phylogeny, outgroup, outgroup_species, outgroup_name)
folder = os.path.join(args.out, 'protortho')
lib.ortho2phylogeny(folder, sco_final, args.num_orthos, busco, args.cpus, args.bootstrap, phylogeny, outgroup, outgroup_species, outgroup_name, sc_buscos)
else:
lib.log.info("Skipping RAxML phylogeny as at least 4 taxa are required")
with open(os.path.join(args.out,'phylogeny.html'), 'w') as output:
Expand Down
Loading

0 comments on commit 6e454e6

Please sign in to comment.