diff --git a/panphlan_find_gene_grp.py b/panphlan_find_gene_grp.py
index 94c9dd4..985f43b 100755
--- a/panphlan_find_gene_grp.py
+++ b/panphlan_find_gene_grp.py
@@ -9,6 +9,7 @@
 import os, subprocess, sys, time, bz2
 import re
 import random
+import itertools
 from sklearn.cluster import OPTICS
 import numpy as np
 import pandas as pd
@@ -16,10 +17,14 @@
 from scipy.spatial.distance import pdist, jaccard
 from scipy import stats
 import argparse as ap
+import multiprocessing
+from joblib import Parallel, delayed
+from statsmodels.stats.multitest import multipletests
+
 __author__ = 'Leonard Dubois and Nicola Segata (contact on https://forum.biobakery.org/)'
-__version__ = '3.0'
-__date__ = '20 April 2020'
+__version__ = '3.0.2'
+__date__ = '4 Dec 2020'

 OPTICS_MIN_PTS = 5

@@ -34,20 +39,26 @@ def read_params():
                     help='Path to presence/absence matrix')
     p.add_argument('-o', '--output', type = str, default = None,
                     help='Path to ouput file with genes groups')
-    p.add_argument('-c', '--cut_core_thres', type = float, default = 0.9,
-                    help='Remove gene families present in [cut_core_thres] of the sample. Defaul is 90 percent (extended core)')
+    p.add_argument('--cut_core_thres', type = float, default = 0.9,
+                    help='Remove gene families present in [cut_core_thres] or more of the samples. Default is 90 percent (extended core)')
+    p.add_argument('--cut_cloud_thres', type = float, default = 0.1,
+                    help='Remove gene families present in [cut_cloud_thres] or less of the samples. Default is 10 percent')
     p.add_argument('--out_plot', type = str, default = None,
                     help='Path to heatmap plot output.')
+    p.add_argument('--plot_clust_pa', type = str, default = None,
+                    help='Output path for the clusters presence/absence heatmap plot')
     p.add_argument('-p', '--pangenome', type = str, default = None,
                     help='Path to pangenome file.')
-    p.add_argument('--optics_xi', type = float, default = 0.01,
+    p.add_argument('--optics_xi', type = float, default = 0.05,
                     help='Xi parameter for OPTICS clustering')
     p.add_argument('--n_jobs', type = int, default = 4,
                     help='How many cores to use for OPTICS clustering. Default 4')
     p.add_argument('--close_analysis', action='store_true',
                     help='Compute analysis of genes proximity in genomes')
+    p.add_argument('--ssp_analysis', action='store_true',
+                    help='Compute analysis of gene cluster links to intra-species phylogeny')
     p.add_argument('--empirical', type = int, default = 1000,
                     help='How many ramdom sample in empirical pvalue generation ? Default 1000')
@@ -59,7 +70,7 @@ def read_params():
 # READ AND PROCESS PANPHLAN MATRIX
 # ------------------------------------------------------------------------------
-def read_and_filter_matrix(filepath, threshold_sums, verbose):
+def read_and_filter_matrix(filepath, core_thres, cloud_thres, verbose):
     panphlan_matrix = pd.read_csv(filepath, sep = '\t', header = 0, index_col = 0)
     if verbose:
         print(' [I] Reading PanPhlAn presence/absence matrix from : ' + str(filepath))
@@ -67,11 +78,19 @@
         print(' ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')

     row_sums = panphlan_matrix.sum(axis = 1)
-    thres_bottom = round(panphlan_matrix.shape[1] * threshold_sums)
+    thres_bottom = round(panphlan_matrix.shape[1] * core_thres)
     keep = row_sums < thres_bottom
     panphlan_matrix = panphlan_matrix[keep]
     if verbose:
-        print(' [I] After removing genes too often present/absent. Cut-off at ' + str(threshold_sums * 100) + ' % of the samples :')
+        print(' [I] After removing genes families too often present. Cut-off at ' + str(core_thres * 100) + ' % of the samples :')
+        print(' Matrix with ' + str(panphlan_matrix.shape[0]) + ' genes families (rows) and')
+        print(' ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')
+
+    thres_top = round(panphlan_matrix.shape[1] * cloud_thres)
+    keep = row_sums > thres_top
+    panphlan_matrix = panphlan_matrix[keep]
+    if verbose:
+        print(' [I] After removing genes too often absent. Cut-off at ' + str(cloud_thres * 100) + ' % of the samples :')
         print(' Matrix with ' + str(panphlan_matrix.shape[0]) + ' genes families (rows) and')
         print(' ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')

@@ -99,20 +118,19 @@ def process_OPTICS(dist_matrix, xi_value, n_jobs, verbose):
                         cluster_method = "xi", xi = xi_value,
                         metric="precomputed", n_jobs = n_jobs).fit(dist_matrix)
-    a = pd.DataFrame(clustering.labels_)
-    # a[0].value_counts()
-    optics_res = dict(zip(dist_matrix.index, clustering.labels_) )
-    # dbscan_res = {"UniRef90_XXX" : cluster_ID, "UniRef90_YYY" : cluster_ID , ...}
+    optics_res_df = pd.DataFrame(clustering.labels_)
+    # optics_res_df[0].value_counts()
+    optics_res_dict = dict(zip(dist_matrix.index, clustering.labels_) )
+    # optics_res = {"UniRef90_XXX" : cluster_ID, "UniRef90_YYY" : cluster_ID , ...}
     if verbose:
         print(' Done')
-    return optics_res
+    return optics_res_dict, optics_res_df


 def write_clusters(dbscan_res, out_file, operon_pval = None, subspec_pval = None):
     OUT = open(out_file, mode='w')
-    #OUT.write("clust_ID\tsize\toperon_pval\tsubspec_pval\tUniRef_ID\n")
-    OUT.write("clust_ID\tsize\toperon_pval\tUniRef_ID\n")
-    for i in range(-1, max(dbscan_res.values()) + 1 ):
+    OUT.write("clust_ID\tsize\toperon_pval\tsubspec_pval\tUniRef_ID\n")
+    for i in range(0, max(dbscan_res.values()) + 1 ):
         OUT.write(str(i) + "\t")
         ids = [k for k,v in dbscan_res.items() if v == i]
         OUT.write(str(len(ids)) + "\t")
@@ -120,10 +138,10 @@
             OUT.write(str(operon_pval[i]) + "\t")
         else:
             OUT.write("NA\t")
-        # if subspec_pval:
-        #     OUT.write(str(subspec_pval[i]) + "\t")
-        # else:
-        #     OUT.write("NA\t")
+        if subspec_pval:
+            OUT.write(str(subspec_pval["clust_" + str(i)]) + "\t")
+        else:
+            OUT.write("NA\t")
         OUT.write(";".join(ids))
         OUT.write("\n")
     OUT.close()
@@ -180,8 +198,38 @@ def plot_heatmap(panphlan_matrix, out_path, clust_res = None ):
     p.cax.set_visible(False)
     p.savefig(out_path, dpi = 300)
+
+def plot_hm_genes_clusters(clust_PA_matrix, out_path, ssp_pval, args ):
+    import seaborn as sns
+    from matplotlib import pyplot as plt
+
+    clust_PA_matrix = clust_PA_matrix.astype(int)
+    print(' [I] Exporting HM of clusters')
+    sys.setrecursionlimit(10**7)
+
+    sns.set(color_codes = True, font_scale = 0.1)
+
+    clust_names = ["clust_" + str(x) for x in set(ssp_pval.keys())]
+    annot_col_map = {True : "#f7f7f7", False : "#1a9641"}
+    lbl_to_col = dict()
+    for lbl, pval in ssp_pval.items():
+        col = annot_col_map[pval <= 0.01]
+        lbl_to_col.update({lbl : col})
+    lbl_to_col = pd.Series(lbl_to_col)
+
+    #plot_width = round(panphlan_matrix.shape[0] / 50)
+    #plot_height = round(panphlan_matrix.shape[1] / 50)
+    #plt.figure()
+    p = sns.clustermap(data = clust_PA_matrix,
+                       metric = 'jaccard',
+                       row_colors = lbl_to_col,
+                       cmap = sns.color_palette(["#f7f7f7", "#2c7bb6"]),
+                       xticklabels = [], yticklabels = [],
+                       dendrogram_ratio=0.1)
+    p.cax.set_visible(False)
+    p.savefig(out_path, dpi = 300)
 # ------------------------------------------------------------------------------
-# ANALYZING GENES GROUPS
+# ANALYZING GENES GROUPS - Proximity
 # ------------------------------------------------------------------------------

 def get_span_ratio(gene_list, df):
@@ -249,39 +297,121 @@ def assessment_operon(clust_res, args):
             clust_is_operon.update({cluster : "NA"})
     return clust_is_operon
+
+# ------------------------------------------------------------------------------
+# ANALYZING GENES GROUPS - Phylogeny
+# ------------------------------------------------------------------------------
-def assessment_subspecies_gene(dbscan_res, panphlan_matrix):
+def create_cluster_PA_matrix(clust_res, panphlan_matrix):
+    """
+    Collapse the presence/absence signature of gene families to the gene cluster level.
+    A gene cluster is considered present if more than half of its genes are present
+    """
+    clust_res.index = panphlan_matrix.index
+    clust_IDs = set(clust_res[0].values)
+    clust_IDs.remove(-1)
+    clust_PA_matrix = pd.DataFrame(index = ["clust_" + str(x) for x in clust_IDs],
+                                   columns = panphlan_matrix.columns )
+    for clust in clust_IDs :
+        uniref_in_clust = clust_res[clust_res[0] == clust].index
+        pres_abs_submat = panphlan_matrix.loc[uniref_in_clust]
+        clust_PA = pres_abs_submat.apply(lambda x : int(sum(x) > 0.5 * len(uniref_in_clust))).T
+        clust_PA_matrix.loc["clust_" + str(clust)] = clust_PA
+    return clust_PA_matrix
+
+
+def assessment_cluster_phylo_link(cluster, clust_PA_matrix, samples_dist, args):
+    if args.verbose:
+        print(' [I] Analyzing genes OPTICS cluster {}'.format(cluster))
+    grp_present = clust_PA_matrix.columns[clust_PA_matrix.loc[str(cluster)] == 1]
+    dist_grp_present = samples_dist.loc[samples_dist['sampleA'].isin(grp_present) & samples_dist['sampleB'].isin(grp_present)]['dist']
+    dist_inter_grp = samples_dist.loc[~samples_dist['sampleA'].isin(grp_present) ^ ~samples_dist['sampleB'].isin(grp_present)]['dist']
+    #mean_base = dist_inter_grp.mean() - dist_grp_present.mean()
+    mean_base = np.std(dist_inter_grp) - np.std(dist_grp_present)
+    mean_base = abs(mean_base)
+
+    empirical_means = []
+    for i in range(1, args.empirical):
+        rand_samples = random.sample(set(clust_PA_matrix.columns), k = len(grp_present))
+        dist_rand_samples = samples_dist.loc[samples_dist['sampleA'].isin(rand_samples) & samples_dist['sampleB'].isin(rand_samples)]['dist']
+        dist_not_rand_samples = samples_dist.loc[samples_dist['sampleA'].isin(rand_samples) ^ samples_dist['sampleB'].isin(rand_samples)]['dist']
+        # to avoid computing the mean over far too many values (which narrows the range of empirical values), a subset is extracted
+        dist_rand_samples = random.sample(list(dist_rand_samples), k = 300)
+        dist_not_rand_samples = random.sample(list(dist_not_rand_samples), k = 300)
+        # empirical_means.append(dist_rand_samples.mean())
+        empirical_means.append(abs(np.mean(dist_not_rand_samples) - np.mean(dist_rand_samples)))
+
+    clust_pval = sum([mean_base < x for x in empirical_means]) / args.empirical
+    print("Cluster {} has pval {}".format(cluster, clust_pval))
+    print("Mean Base {} ".format(mean_base))
+
+    return([cluster, clust_pval])
+
+
+# TEST WITH MEDOID
+def assessment_cluster_phylo_link_medoid(cluster, clust_PA_matrix, sample_dist_matrix, args):
+    if args.verbose:
+        print(' [I] Analyzing genes OPTICS cluster {}'.format(cluster))
+    grp_present = clust_PA_matrix.columns[clust_PA_matrix.loc[str(cluster)] == 1]
+
+    tmp = sample_dist_matrix[grp_present]
+    tmp = tmp.T[grp_present]
+    medoid_index = np.argmin(tmp.sum(axis=0))
+    mean_dist_to_medoid = tmp.iloc[medoid_index].mean()
+    empirical_means = []
+    for i in range(1, args.empirical):
+        rand_samples = random.sample(set(clust_PA_matrix.columns), k = len(grp_present))
+        tmp = sample_dist_matrix[rand_samples]
+        tmp = tmp.T[rand_samples]
+        medoid_index_emp = np.argmin(tmp.sum(axis=0))
+        empirical_means.append(tmp.iloc[medoid_index_emp].mean())
+
+    clust_pval = sum([mean_dist_to_medoid > x for x in empirical_means]) / args.empirical
+    print("Cluster {} has pval {}".format(cluster, clust_pval))
+    print("Mean Base {} ".format(mean_dist_to_medoid))
+    return([cluster, clust_pval])
+
+
+def assessment_subspecies_gene(clust_res, panphlan_matrix, clust_PA_matrix, args):
     """WIP
-    Check if group of genes is driving the phylogenetic dendrogramm
+    Check if group of genes is linked to the phylogenetic dendrogram
+    Check if distance between samples is significantly different between
+    the samples which have the gene cluster (or not) and the others.
+    Use empirical pvalue (random samples)
     WIP
     """
-    clust_subspec = dict()
-    table_count = {a : list(dbscan_res.values()).count(a) for a in dbscan_res.values()}
-    table_count = sorted(table_count, key = table_count.get, reverse = True)
-    for cluster in table_count:
-        if cluster in table_count[0:2]:
-            clust_subspec[cluster] = "NA"
-            continue
-        genes = [k for k,v in dbscan_res.items() if v == cluster]
-        slice = panphlan_matrix.loc[genes]
-        colsums = slice.sum()
-        # DO NOT WORK
-        present_grp = [a == len(genes) for a in colsums]
-        absent_grp = [not a for a in present_grp]
-        #present_grp = slice[slice.columns[present_grp]]
-        #absent_grp = slice[slice.columns[absent_grp]]
-        present_grp = panphlan_matrix[slice.columns[present_grp]]
-        absent_grp = panphlan_matrix[slice.columns[absent_grp]]
-        print(present_grp)
-
-        pres_dist = pdist(present_grp.T, 'jaccard')
-        absent_dist = pdist(absent_grp.T, 'jaccard')
-        x= stats.mannwhitneyu(pres_dist, absent_dist)
-        clust_subspec[cluster] = x.pvalue
-        # pvalue always 0 does not work :(
-
-    return clust_subspec
+    clust_res.index = panphlan_matrix.index
+    clust_IDs = set(clust_res[0].values)
+    clust_IDs.remove(-1)
+
+    # then compute pairwise distances of samples based on PanPhlAn presence/absence
+    # Shown to be correlated with StrainPhlAn distances
+    samples_dist = pd.DataFrame(itertools.combinations(panphlan_matrix.columns, 2),
+                                columns=['sampleA','sampleB'])
+    samples_dist['dist'] = pdist(panphlan_matrix.T, 'jaccard')
+    sample_dist_matrix = pd.DataFrame(squareform(samples_dist['dist']),
+                                      index = panphlan_matrix.columns,
+                                      columns = panphlan_matrix.columns)
+
+    clust_phylo_test = dict()
+
+    par_res = Parallel(n_jobs=args.n_jobs)(
+        delayed(assessment_cluster_phylo_link)(i, clust_PA_matrix, samples_dist, args) for i in clust_PA_matrix.index)
+
+    for x,y in par_res :
+        clust_phylo_test.update({x : y})
+
+    # stick with bonferroni ? or BH ?
+    clust_phylo_test_df = pd.DataFrame.from_dict(clust_phylo_test, orient='index', columns = ['raw_pval'])
+    clust_phylo_test_df['pval_bonf'] = multipletests(clust_phylo_test_df['raw_pval'], method = 'bonferroni')[1]
+    clust_phylo_test_bonf = dict()
+    for index, row in clust_phylo_test_df.iterrows():
+        clust_phylo_test_bonf.update({index : row['pval_bonf']})
+
+
+    # return clust_phylo_test_bonf
+    return clust_phylo_test

 # ------------------------------------------------------------------------------
 # MAIN
 # ------------------------------------------------------------------------------
@@ -292,21 +422,37 @@ def main():
         sys.exit('[E] This software uses Python 3, please update Python')

     args = read_params()
-    panphlan_matrix = read_and_filter_matrix(args.i_matrix, args.cut_core_thres, args.verbose)
+    panphlan_matrix = read_and_filter_matrix(args.i_matrix, args.cut_core_thres, args.cut_cloud_thres, args.verbose)

     if args.output:
         dist_matrix = compute_dist(panphlan_matrix, args.verbose)
-        optics_res = process_OPTICS(dist_matrix, args.optics_xi, args.n_jobs, args.verbose)
+        optics_res_dict, optics_res_df = process_OPTICS(dist_matrix, args.optics_xi, args.n_jobs, args.verbose)
         if args.close_analysis:
-            operon_pval = assessment_operon(optics_res, args)
+            operon_pval = assessment_operon(optics_res_dict, args)
         else:
            operon_pval = None
-        #subspec_pval = assessment_subspecies_gene(dbscan_res, panphlan_matrix)
-        #write_clusters(dbscan_res, args.output, operon_pval = operon_pval, subspec_pval = subspec_pval)
-        write_clusters(optics_res, args.output, operon_pval = operon_pval)
+        if args.ssp_analysis:
+            clust_PA_matrix = create_cluster_PA_matrix(optics_res_df, panphlan_matrix)
+            ssp_pval = assessment_subspecies_gene(optics_res_df, panphlan_matrix, clust_PA_matrix, args)
+            # with open('ssp_pval.pkl', 'wb') as out_pkl:
+            #     pickle.dump(ssp_pval, out_pkl)
+
+            # with open('ssp_pval.pkl', 'rb') as in_pkl:
+            #     ssp_pval = pickle.load(in_pkl)
+            # del ssp_pval[-1]
+            # with open('PA_clust.pkl', 'wb') as out_pkl:
+            #     pickle.dump(clust_PA_matrix, out_pkl)
+
+        else :
+            ssp_pval = None
+
+        # print(clust_PA_matrix)
+        write_clusters(optics_res_dict, args.output, operon_pval = operon_pval, subspec_pval = ssp_pval)
         if args.out_plot:
-            plot_heatmap(panphlan_matrix, args.out_plot, optics_res)
+            plot_heatmap(panphlan_matrix, args.out_plot, optics_res_dict)
+        if args.plot_clust_pa:
+            plot_hm_genes_clusters(clust_PA_matrix, args.plot_clust_pa, ssp_pval, args )
     elif args.out_plot:
         plot_heatmap(panphlan_matrix, args.out_plot)

diff --git a/panphlan_profiling.py b/panphlan_profiling.py
index 689f43d..c43c44a 100755
--- a/panphlan_profiling.py
+++ b/panphlan_profiling.py
@@ -281,7 +281,13 @@ def get_sampleID_from_path(sample_path):
 def read_gene_cov_file(input_file):
     """Convert coverage mapping file into a dictionary data structure"""
     d = {}
+
+    # this seems to have been designed for Python 2
+    #f = bz2.BZ2File(input_file, mode='r')
+    # Here is the new way (Python 3)
     f = bz2.open(input_file, mode='rt')
+    # even though the first way was working as well
+
     for line in f:
         words = line.strip().split('\t')
         gene, coverage = words[0], int(words[1])
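Note for readers of this patch: the empirical test in assessment_cluster_phylo_link boils down to a permutation scheme. The observed statistic for a gene cluster is computed from the pairwise Jaccard distances between samples, and it is compared with the same statistic recomputed for randomly drawn sample sets of the same size; the p-value is the fraction of random draws that are at least as extreme. The standalone sketch below only illustrates that scheme and is not the patch code: it uses the mean within-group distance as the statistic (the patch uses the absolute difference in standard deviation between between-group and within-group distances), and the function name and arguments are hypothetical.

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

def empirical_cluster_pval(pa_matrix, samples_with_cluster, n_perm=1000, seed=0):
    """Empirical p-value: are the samples carrying a gene cluster more similar
    to each other (smaller pairwise Jaccard distance) than random sample sets
    of the same size?"""
    rng = np.random.default_rng(seed)
    # pairwise Jaccard distances between samples (columns of the 0/1 matrix)
    dist = squareform(pdist(pa_matrix.T.values.astype(bool), metric='jaccard'))
    idx = [pa_matrix.columns.get_loc(s) for s in samples_with_cluster]
    iu = np.triu_indices(len(idx), k=1)           # upper triangle, no diagonal
    observed = dist[np.ix_(idx, idx)][iu].mean()  # mean within-group distance
    hits = 0
    for _ in range(n_perm):
        rand_idx = rng.choice(pa_matrix.shape[1], size=len(idx), replace=False)
        rand_mean = dist[np.ix_(rand_idx, rand_idx)][iu].mean()
        if rand_mean <= observed:                 # random group at least as tight
            hits += 1
    return hits / n_perm

With the objects defined in the patch, the carriers of one cluster could be obtained as panphlan_matrix.columns[clust_PA_matrix.loc['clust_0'] == 1] (the cluster label here is hypothetical).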
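The majority-rule collapse done by create_cluster_PA_matrix and the Bonferroni step at the end of assessment_subspecies_gene can likewise be summarised in a few lines. This sketch is illustrative only: the 0.5 threshold and the use of statsmodels multipletests follow the patch, while the function names, the gene_to_cluster mapping and the variable names are hypothetical.

import pandas as pd
from statsmodels.stats.multitest import multipletests

def collapse_to_clusters(panphlan_matrix, gene_to_cluster):
    """Collapse a gene-family x sample 0/1 matrix to cluster x sample level.
    A cluster is called present in a sample when more than half of its gene
    families are present (majority rule)."""
    rows = {}
    for clust, genes in gene_to_cluster.items():   # e.g. {'clust_0': [UniRef90 IDs]}
        sub = panphlan_matrix.loc[genes]
        rows[clust] = (sub.sum(axis=0) > 0.5 * len(genes)).astype(int)
    return pd.DataFrame(rows).T                    # clusters as rows, samples as columns

def bonferroni_correct(raw_pvals):
    """Map {cluster: raw p-value} to {cluster: Bonferroni-adjusted p-value}."""
    labels = list(raw_pvals)
    adjusted = multipletests([raw_pvals[l] for l in labels], method='bonferroni')[1]
    return dict(zip(labels, adjusted))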