
Commit

Merge branch 'phylo_test'
leonarDubois committed Jan 12, 2021
2 parents 965c70a + 4de551c commit 9104590
Showing 2 changed files with 208 additions and 56 deletions.
258 changes: 202 additions & 56 deletions panphlan_find_gene_grp.py
@@ -9,17 +9,22 @@
import os, subprocess, sys, time, bz2
import re
import random
import itertools
from sklearn.cluster import OPTICS
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist, jaccard
from scipy import stats
import argparse as ap
import multiprocessing
from joblib import Parallel, delayed
from statsmodels.stats.multitest import multipletests


__author__ = 'Leonard Dubois and Nicola Segata (contact on https://forum.biobakery.org/)'
__version__ = '3.0'
__date__ = '20 April 2020'
__version__ = '3.0.2'
__date__ = '4 Dec 2020'

OPTICS_MIN_PTS = 5

@@ -34,20 +34,26 @@ def read_params():
help='Path to presence/absence matrix')
p.add_argument('-o', '--output', type = str, default = None,
help='Path to output file with gene groups')
p.add_argument('-c', '--cut_core_thres', type = float, default = 0.9,
help='Remove gene families present in [cut_core_thres] of the samples. Default is 90 percent (extended core)')
p.add_argument('--cut_core_thres', type = float, default = 0.9,
help='Remove gene families present in [cut_core_thres] or more of the samples. Default is 90 percent (extended core)')
p.add_argument('--cut_cloud_thres', type = float, default = 0.1,
help='Remove gene families present in [cut_cloud_thres] or less of the samples. Default is 10 percent')
p.add_argument('--out_plot', type = str, default = None,
help='Path to heatmap plot output.')
p.add_argument('--plot_clust_pa', type = str, default = None,
help='Output : Path to heatmap of clusters presence-absence plot')
p.add_argument('-p', '--pangenome', type = str, default = None,
help='Path to pangenome file.')

p.add_argument('--optics_xi', type = float, default = 0.01,
p.add_argument('--optics_xi', type = float, default = 0.05,
help='Xi parameter for OPTICS clustering')
p.add_argument('--n_jobs', type = int, default = 4,
help='How many cores to use for OPTICS clustering. Default 4')

p.add_argument('--close_analysis', action='store_true',
help='Compute analysis of genes proximity in genomes')
p.add_argument('--ssp_analysis', action='store_true',
help='Compute analysis of genes cluster links to intra species phylogeny')
p.add_argument('--empirical', type = int, default = 1000,
help='How many random samples for empirical p-value generation? Default 1000')

@@ -59,19 +70,27 @@ def read_params():
# READ AND PROCESS PANPHLAN MATRIX
# ------------------------------------------------------------------------------

def read_and_filter_matrix(filepath, threshold_sums, verbose):
def read_and_filter_matrix(filepath, core_thres, cloud_thres, verbose):
panphlan_matrix = pd.read_csv(filepath, sep = '\t', header = 0, index_col = 0)
if verbose:
print(' [I] Reading PanPhlAn presence/absence matrix from : ' + str(filepath))
print(' Matrix with ' + str(panphlan_matrix.shape[0]) + ' gene families (rows) and')
print(' ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')

row_sums = panphlan_matrix.sum(axis = 1)
thres_bottom = round(panphlan_matrix.shape[1] * threshold_sums)
thres_bottom = round(panphlan_matrix.shape[1] * core_thres)
keep = row_sums < thres_bottom
panphlan_matrix = panphlan_matrix[keep]
if verbose:
print(' [I] After removing genes too often present/absent. Cut-off at ' + str(threshold_sums * 100) + ' % of the samples :')
print(' [I] After removing gene families too often present. Cut-off at ' + str(core_thres * 100) + ' % of the samples :')
print(' Matrix with ' + str(panphlan_matrix.shape[0]) + ' gene families (rows) and')
print(' ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')

thres_top = round(panphlan_matrix.shape[1] * cloud_thres)
keep = row_sums > thres_top
panphlan_matrix = panphlan_matrix[keep]
if verbose:
print(' [I] After removing genes too often absent. Cut-off at ' + str(cloud_thres * 100) + ' % of the samples :')
print(' Matrix with ' + str(panphlan_matrix.shape[0]) + ' gene families (rows) and')
print(' ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')

@@ -99,31 +118,30 @@ def process_OPTICS(dist_matrix, xi_value, n_jobs, verbose):
cluster_method = "xi", xi = xi_value,
metric="precomputed", n_jobs = n_jobs).fit(dist_matrix)

a = pd.DataFrame(clustering.labels_)
# a[0].value_counts()
optics_res = dict(zip(dist_matrix.index, clustering.labels_) )
# dbscan_res = {"UniRef90_XXX" : cluster_ID, "UniRef90_YYY" : cluster_ID , ...}
optics_res_df = pd.DataFrame(clustering.labels_)
# optics_res_df[0].value_counts()
optics_res_dict = dict(zip(dist_matrix.index, clustering.labels_) )
# optics_res = {"UniRef90_XXX" : cluster_ID, "UniRef90_YYY" : cluster_ID , ...}
if verbose:
print(' Done')
return optics_res
return optics_res_dict, optics_res_df


def write_clusters(dbscan_res, out_file, operon_pval = None, subspec_pval = None):
OUT = open(out_file, mode='w')
#OUT.write("clust_ID\tsize\toperon_pval\tsubspec_pval\tUniRef_ID\n")
OUT.write("clust_ID\tsize\toperon_pval\tUniRef_ID\n")
for i in range(-1, max(dbscan_res.values()) + 1 ):
OUT.write("clust_ID\tsize\toperon_pval\tsubspec_pval\tUniRef_ID\n")
for i in range(0, max(dbscan_res.values()) + 1 ):
OUT.write(str(i) + "\t")
ids = [k for k,v in dbscan_res.items() if v == i]
OUT.write(str(len(ids)) + "\t")
if operon_pval:
OUT.write(str(operon_pval[i]) + "\t")
else:
OUT.write("NA\t")
# if subspec_pval:
# OUT.write(str(subspec_pval[i]) + "\t")
# else:
# OUT.write("NA\t")
if subspec_pval:
OUT.write(str(subspec_pval["clust_" + str(i)]) + "\t")
else:
OUT.write("NA\t")
OUT.write(";".join(ids))
OUT.write("\n")
OUT.close()
@@ -180,8 +198,38 @@ def plot_heatmap(panphlan_matrix, out_path, clust_res = None ):
p.cax.set_visible(False)
p.savefig(out_path, dpi = 300)


def plot_hm_genes_clusters(clust_PA_matrix, out_path, ssp_pval, args ):
import seaborn as sns
from matplotlib import pyplot as plt

clust_PA_matrix = clust_PA_matrix.astype(int)
print(' [I] Exporting HM of clusters')
sys.setrecursionlimit(10**7)

sns.set(color_codes = True, font_scale = 0.1)

clust_names = ["clust_" + str(x) for x in set(ssp_pval.keys())]
annot_col_map = {True : "#f7f7f7", False : "#1a9641"}
lbl_to_col = dict()
for lbl, pval in ssp_pval.items():
col = annot_col_map[pval <= 0.01]
lbl_to_col.update({lbl : col})
lbl_to_col = pd.Series(lbl_to_col)

#plot_width = round(panphlan_matrix.shape[0] / 50)
#plot_height = round(panphlan_matrix.shape[1] / 50)
#plt.figure()
p = sns.clustermap(data = clust_PA_matrix,
metric = 'jaccard',
row_colors = lbl_to_col,
cmap = sns.color_palette(["#f7f7f7", "#2c7bb6"]),
xticklabels = [], yticklabels = [],
dendrogram_ratio=0.1)
p.cax.set_visible(False)
p.savefig(out_path, dpi = 300)
# ------------------------------------------------------------------------------
# ANALYZING GENES GROUPS
# ANALYZING GENES GROUPS - Proximity
# ------------------------------------------------------------------------------

def get_span_ratio(gene_list, df):
@@ -249,39 +297,121 @@ def assessment_operon(clust_res, args):
clust_is_operon.update({cluster : "NA"})

return clust_is_operon

# ------------------------------------------------------------------------------
# ANALYZING GENES GROUPS - Phylogeny
# ------------------------------------------------------------------------------


def assessment_subspecies_gene(dbscan_res, panphlan_matrix):
def create_cluster_PA_matrix(clust_res, panphlan_matrix):
"""
Collapse the presence/absence signature of gene families to the gene cluster level
A gene cluster is considered present in a sample if more than half of its genes are present
"""
clust_res.index = panphlan_matrix.index
clust_IDs = set(clust_res[0].values)
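# OPTICS labels noise points as -1; they are dropped rather than treated as a cluster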
clust_IDs.remove(-1)
clust_PA_matrix = pd.DataFrame(index = ["clust_" + str(x) for x in clust_IDs],
columns = panphlan_matrix.columns )
for clust in clust_IDs :
uniref_in_clust = clust_res[clust_res[0] == clust].index
pres_abs_submat = panphlan_matrix.loc[uniref_in_clust]
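# apply() runs column-wise: a sample gets 1 when more than half of the cluster's gene families are present in it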
clust_PA = pres_abs_submat.apply(lambda x : int(sum(x) > 0.5 * len(uniref_in_clust))).T
clust_PA_matrix.loc["clust_" + str(clust)] = clust_PA
return clust_PA_matrix


def assessment_cluster_phylo_link(cluster, clust_PA_matrix, samples_dist, args):
if args.verbose:
print(' [I] Analyzing genes OPTICS cluster {}'.format(cluster))
grp_present = clust_PA_matrix.columns[clust_PA_matrix.loc[str(cluster)] == 1]
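# within-group distances: pairs of samples that both carry the gene cluster
# between-group distances (XOR below): pairs where exactly one of the two samples carries it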
dist_grp_present = samples_dist.loc[samples_dist['sampleA'].isin(grp_present) & samples_dist['sampleB'].isin(grp_present)]['dist']
dist_inter_grp = samples_dist.loc[~samples_dist['sampleA'].isin(grp_present) ^ ~samples_dist['sampleB'].isin(grp_present)]['dist']
#mean_base = dist_inter_grp.mean() - dist_grp_present.mean()
mean_base = np.std(dist_inter_grp) - np.std(dist_grp_present)
mean_base = abs(mean_base)

empirical_means = []
for i in range(1, args.empirical):
rand_samples = random.sample(set(clust_PA_matrix.columns), k = len(grp_present))
dist_rand_samples = samples_dist.loc[samples_dist['sampleA'].isin(rand_samples) & samples_dist['sampleB'].isin(rand_samples)]['dist']
dist_not_rand_samples = samples_dist.loc[samples_dist['sampleA'].isin(rand_samples) ^ samples_dist['sampleB'].isin(rand_samples)]['dist']
# a subset is extracted here to avoid computing the mean over too many values, which would give an overly narrow range of empirical values
dist_rand_samples = random.sample(list(dist_rand_samples), k = 300)
dist_not_rand_samples = random.sample(list(dist_not_rand_samples), k = 300)
# empirical_means.append(dist_rand_samples.mean())
empirical_means.append(abs(np.mean(dist_not_rand_samples) - np.mean(dist_rand_samples)))

clust_pval = sum([mean_base < x for x in empirical_means]) / args.empirical
print("Cluster {} has pval {}".format(cluster, clust_pval))
print("Mean Base {} ".format(mean_base))

return([cluster, clust_pval])


# TEST WITH MEDOID
def assessment_cluster_phylo_link_medoid(cluster, clust_PA_matrix, sample_dist_matrix, args):
if args.verbose:
print(' [I] Analyzing genes OPTICS cluster {}'.format(cluster))
grp_present = clust_PA_matrix.columns[clust_PA_matrix.loc[str(cluster)] == 1]

tmp = sample_dist_matrix[grp_present]
tmp = tmp.T[grp_present]
medoid_index = np.argmin(tmp.sum(axis=0))
mean_dist_to_medoid = tmp.iloc[medoid_index].mean()
empirical_means = []
for i in range(1, args.empirical):
rand_samples = random.sample(set(clust_PA_matrix.columns), k = len(grp_present ))
tmp = sample_dist_matrix[rand_samples]
tmp = tmp.T[rand_samples]
medoid_index_emp = np.argmin(tmp.sum(axis=0))
empirical_means.append(tmp.iloc[medoid_index_emp].mean())

clust_pval = sum([mean_dist_to_medoid > x for x in empirical_means]) / args.empirical
print("Cluster {} has pval {}".format(cluster, clust_pval))
print("Mean Base {} ".format(mean_dist_to_medoid))
return([cluster, clust_pval])


def assessment_subspecies_gene(clust_res, panphlan_matrix, clust_PA_matrix, args):
"""WIP
Check if a group of genes is driving the phylogenetic dendrogram
Check if a group of genes is linked to the phylogenetic dendrogram
Check whether the distance between samples differs significantly between
the samples which have the gene cluster and those which do not.
Use an empirical p-value (random samples)
WIP
"""
clust_subspec = dict()
table_count = {a : list(dbscan_res.values()).count(a) for a in dbscan_res.values()}
table_count = sorted(table_count, key = table_count.get, reverse = True)
for cluster in table_count:
if cluster in table_count[0:2]:
clust_subspec[cluster] = "NA"
continue
genes = [k for k,v in dbscan_res.items() if v == cluster]
slice = panphlan_matrix.loc[genes]
colsums = slice.sum()
# DO NOT WORK
present_grp = [a == len(genes) for a in colsums]
absent_grp = [not a for a in present_grp]
#present_grp = slice[slice.columns[present_grp]]
#absent_grp = slice[slice.columns[absent_grp]]
present_grp = panphlan_matrix[slice.columns[present_grp]]
absent_grp = panphlan_matrix[slice.columns[absent_grp]]
print(present_grp)

pres_dist = pdist(present_grp.T, 'jaccard')
absent_dist = pdist(absent_grp.T, 'jaccard')
x= stats.mannwhitneyu(pres_dist, absent_dist)
clust_subspec[cluster] = x.pvalue
# pvalue always 0 does not work :(

return clust_subspec
clust_res.index = panphlan_matrix.index
clust_IDs = set(clust_res[0].values)
clust_IDs.remove(-1)

# then compute pairwise distances of samples based on PanPhlAn presence/absence
# Shown to be correlated with StrainPhlAn distances
samples_dist = pd.DataFrame(itertools.combinations(panphlan_matrix.columns, 2),
columns=['sampleA','sampleB'])
samples_dist['dist'] = pdist(panphlan_matrix.T, 'jaccard')
sample_dist_matrix = pd.DataFrame(squareform(samples_dist['dist'] ),
index = panphlan_matrix.columns,
columns = panphlan_matrix.columns)

clust_phylo_test = dict()

par_res = Parallel(n_jobs=args.n_jobs)(
delayed(assessment_cluster_phylo_link)(i, clust_PA_matrix, samples_dist, args) for i in clust_PA_matrix.index)

for x,y in par_res :
clust_phylo_test.update({x : y})

# stick with bonferroni ? or BH ?
clust_phylo_test_df = pd.DataFrame.from_dict(clust_phylo_test, orient='index', columns = ['raw_pval'])
clust_phylo_test_df['pval_bonf'] = multipletests(clust_phylo_test_df['raw_pval'], method = 'bonferroni')[1]
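# a BH/FDR alternative would be multipletests(clust_phylo_test_df['raw_pval'], method='fdr_bh')[1]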
clust_phylo_test_bonf = dict()
for index, row in clust_phylo_test_df.iterrows():
clust_phylo_test_bonf.update({index : row['pval_bonf']})


# return clust_phylo_test_bonf
return clust_phylo_test

# ------------------------------------------------------------------------------
# MAIN
@@ -292,21 +422,37 @@ def main():
sys.exit('[E] This software uses Python 3, please update Python')
args = read_params()

panphlan_matrix = read_and_filter_matrix(args.i_matrix, args.cut_core_thres, args.verbose)
panphlan_matrix = read_and_filter_matrix(args.i_matrix, args.cut_core_thres, args.cut_cloud_thres, args.verbose)

if args.output:
dist_matrix = compute_dist(panphlan_matrix, args.verbose)

optics_res = process_OPTICS(dist_matrix, args.optics_xi, args.n_jobs, args.verbose)
optics_res_dict, optics_res_df = process_OPTICS(dist_matrix, args.optics_xi, args.n_jobs, args.verbose)
if args.close_analysis:
operon_pval = assessment_operon(optics_res, args)
operon_pval = assessment_operon(optics_res_dict, args)
else:
operon_pval = None
#subspec_pval = assessment_subspecies_gene(dbscan_res, panphlan_matrix)
#write_clusters(dbscan_res, args.output, operon_pval = operon_pval, subspec_pval = subspec_pval)
write_clusters(optics_res, args.output, operon_pval = operon_pval)
if args.ssp_analysis:
clust_PA_matrix = create_cluster_PA_matrix(optics_res_df, panphlan_matrix)
ssp_pval = assessment_subspecies_gene(optics_res_df, panphlan_matrix, clust_PA_matrix, args)
# with open('ssp_pval.pkl', 'wb') as out_pkl:
# pickle.dump(ssp_pval, out_pkl)

# with open('ssp_pval.pkl', 'rb') as in_pkl:
# ssp_pval = pickle.load(in_pkl)
# del ssp_pval[-1]
# with open('PA_clust.pkl', 'wb') as out_pkl:
# pickle.dump(clust_PA_matrix, out_pkl)

else :
ssp_pval = None

# print(clust_PA_matrix)
write_clusters(optics_res_dict, args.output, operon_pval = operon_pval, subspec_pval = ssp_pval)
if args.out_plot:
plot_heatmap(panphlan_matrix, args.out_plot, optics_res)
plot_heatmap(panphlan_matrix, args.out_plot, optics_res_dict)
if args.plot_clust_pa:
plot_hm_genes_clusters(clust_PA_matrix, args.plot_clust_pa, ssp_pval, args )
elif args.out_plot:
plot_heatmap(panphlan_matrix, args.out_plot)

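As a reading aid only (not part of this commit): a self-contained toy version of the empirical test implemented in assessment_cluster_phylo_link, with hypothetical sample names and random presence/absence data, and without the 300-distance subsampling used in the script.

import itertools
import random
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist

samples = ["s" + str(i) for i in range(20)]
# toy matrix: 50 hypothetical gene families (rows) x 20 samples (columns)
pa = pd.DataFrame(np.random.default_rng(42).integers(0, 2, (50, 20)), columns=samples)
dist = pd.DataFrame(list(itertools.combinations(samples, 2)), columns=["sampleA", "sampleB"])
dist["dist"] = pdist(pa.T, "jaccard")

present = set(samples[:8])  # pretend these samples carry the gene cluster

def spread_gap(grp):
    # |std(between-group distances) - std(within-group distances)|, as in the script
    within = dist[dist["sampleA"].isin(grp) & dist["sampleB"].isin(grp)]["dist"]
    between = dist[dist["sampleA"].isin(grp) ^ dist["sampleB"].isin(grp)]["dist"]
    return abs(np.std(between) - np.std(within))

obs = spread_gap(present)
rng = random.Random(0)
null = [spread_gap(set(rng.sample(samples, len(present)))) for _ in range(1000)]
pval = sum(obs < x for x in null) / 1000  # empirical p-value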
6 changes: 6 additions & 0 deletions panphlan_profiling.py
@@ -281,7 +281,13 @@ def get_sampleID_from_path(sample_path):
def read_gene_cov_file(input_file):
"""Convert coverage mapping file into a dictionary data structure"""
d = {}

# this seems to have been designed for Python 2
#f = bz2.BZ2File(input_file, mode='r')
# Here is the new way (Python 3): open in text mode so lines come back as str
f = bz2.open(input_file, mode='rt')
# even though the first way was working as well

for line in f:
words = line.strip().split('\t')
gene, coverage = words[0], int(words[1])
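A minimal sketch (hypothetical file name, not from the repository) of why text mode matters for the change above:

import bz2

# bz2.open(..., mode='rt') yields str lines, so line.strip().split('\t') works directly
with bz2.open("sample1_mapping.csv.bz2", mode="rt") as f:
    gene, coverage = next(f).strip().split('\t')[:2]

# bz2.BZ2File(..., mode='r') is binary: lines come back as bytes and would need
# a bytes separator (b'\t') or an explicit .decode() before splitting
with bz2.BZ2File("sample1_mapping.csv.bz2", mode="r") as f:
    gene_b, coverage_b = next(f).strip().split(b'\t')[:2]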
