From 94b7032d0bd0dae39b6ed510846d93316fb5e430 Mon Sep 17 00:00:00 2001
From: leonarDubois <lha.dubois@gmail.com>
Date: Wed, 20 Jan 2021 09:27:44 +0000
Subject: [PATCH] 3.1 convenience update push

---
 panphlan_find_gene_grp.py | 465 --------------------------------------
 1 file changed, 465 deletions(-)
 delete mode 100755 panphlan_find_gene_grp.py

diff --git a/panphlan_find_gene_grp.py b/panphlan_find_gene_grp.py
deleted file mode 100755
index 985f43b..0000000
--- a/panphlan_find_gene_grp.py
+++ /dev/null
@@ -1,465 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Analyzing PanPhlAn presence absence matrices
-    - plot the matrix as heatmap using seaborn
-    - based on location on the contigs, assess if a group of gene can be mobile element
-"""
-
-import os, subprocess, sys, time, bz2
-import re
-import random
-import itertools
-from sklearn.cluster import OPTICS
-import numpy as np
-import pandas as pd
-from scipy.spatial.distance import squareform
-from scipy.spatial.distance import pdist, jaccard
-from scipy import stats
-import argparse as ap
-import multiprocessing
-from joblib import Parallel, delayed
-from statsmodels.stats.multitest import multipletests
-
-
-author__ = 'Leonard Dubois and Nicola Segata (contact on https://forum.biobakery.org/)'
-__version__ = '3.0.2'
-__date__ = '4 Dec 2020'
-
-OPTICS_MIN_PTS = 5
-
-# ------------------------------------------------------------------------------
-"""
-Reads and parses the command line arguments of the script.
-:returns: the parsed arguments
-"""
-def read_params():
-    p = ap.ArgumentParser(description="")
-    p.add_argument('-i','--i_matrix', type=str, default = None,
-                    help='Path to presence/absence matrix')
-    p.add_argument('-o', '--output', type = str, default = None,
-                    help='Path to ouput file with genes groups')
-    p.add_argument('--cut_core_thres', type = float, default = 0.9,
-                    help='Remove gene families present in [cut_core_thres] or more of the sample. Defaul is 90 percent (extended core)')
-    p.add_argument('--cut_cloud_thres', type = float, default = 0.1,
-                    help='Remove gene families present in [cut_core_thres] or less of the sample. Defaul is 5 percent')
-    p.add_argument('--out_plot', type = str, default = None,
-                    help='Path to heatmap plot output.')
-    p.add_argument('--plot_clust_pa', type = str, default = None,
-                    help='Output : Path to heatmap of clusters presence-absence plot')                
-    p.add_argument('-p', '--pangenome', type = str, default = None,
-                    help='Path to pangenome file.')
-
-    p.add_argument('--optics_xi', type = float, default = 0.05,
-                    help='Xi parameter for OPTICS clustering')
-    p.add_argument('--n_jobs', type = int, default = 4,
-                    help='How many cores to use for OPTICS clustering.  Default 4')
-
-    p.add_argument('--close_analysis', action='store_true',
-                    help='Compute analysis of genes proximity in genomes')
-    p.add_argument('--ssp_analysis', action='store_true',
-                    help='Compute analysis of genes cluster links to intra species phylogeny')
-    p.add_argument('--empirical', type = int, default = 1000,
-                    help='How many ramdom sample in empirical pvalue generation ? Default 1000')
-
-    p.add_argument('-v', '--verbose', action='store_true',
-                    help='Show progress information')
-    return p.parse_args()
-
-# ------------------------------------------------------------------------------
-#   READ AND PROCESS PANPHLAN MATRIX
-# ------------------------------------------------------------------------------
-
-def read_and_filter_matrix(filepath, core_thres, cloud_thres, verbose):
-    panphlan_matrix = pd.read_csv(filepath, sep = '\t', header = 0, index_col = 0)
-    if verbose:
-        print(' [I] Reading PanPhlAn presence/absence matrix from : ' + str(filepath))
-        print('     Matrix with ' + str(panphlan_matrix.shape[0]) + ' genes families (rows) and')
-        print('                 ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')
-
-    row_sums = panphlan_matrix.sum(axis = 1)
-    thres_bottom = round(panphlan_matrix.shape[1] * core_thres)
-    keep = row_sums < thres_bottom
-    panphlan_matrix = panphlan_matrix[keep]
-    if verbose:
-        print(' [I] After removing genes families too often present. Cut-off at ' + str(core_thres * 100) + ' % of the samples :')
-        print('     Matrix with ' + str(panphlan_matrix.shape[0]) + ' genes families (rows) and')
-        print('                 ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')
-        
-    thres_top = round(panphlan_matrix.shape[1] * cloud_thres)
-    keep = row_sums > thres_top
-    panphlan_matrix = panphlan_matrix[keep]
-    if verbose:
-        print(' [I] After removing genes too often absent. Cut-off at ' + str(cloud_thres * 100) + ' % of the samples :')
-        print('     Matrix with ' + str(panphlan_matrix.shape[0]) + ' genes families (rows) and')
-        print('                 ' + str(panphlan_matrix.shape[1]) + ' samples (columns)')
-
-    return panphlan_matrix
-
-
-def compute_dist(panphlan_matrix, verbose):
-    if verbose:
-        print(' [I] Computing Jaccard distance between gene families...')
-    a = pdist(panphlan_matrix, 'jaccard')
-    dist_matrix = pd.DataFrame(squareform(a),
-                                index = panphlan_matrix.index,
-                                columns = panphlan_matrix.index)
-    if verbose: print(' Done')
-    return dist_matrix
-
-# ------------------------------------------------------------------------------
-#   CLUSTERING AND DEFINING GROUPS
-# ------------------------------------------------------------------------------
-
-def process_OPTICS(dist_matrix, xi_value, n_jobs, verbose):
-    if verbose:
-        print(' [I] Performing OPTICS clustering...')
-    clustering = OPTICS(min_samples = OPTICS_MIN_PTS,
-                        cluster_method = "xi", xi = xi_value,
-                        metric="precomputed", n_jobs = n_jobs).fit(dist_matrix)
-
-    optics_res_df = pd.DataFrame(clustering.labels_)
-    # optics_res_df[0].value_counts()
-    optics_res_dict = dict(zip(dist_matrix.index, clustering.labels_) )
-    # optics_res = {"UniRef90_XXX" : cluster_ID, "UniRef90_YYY" : cluster_ID , ...}
-    if verbose:
-        print(' Done')
-    return optics_res_dict, optics_res_df
-
-
-def write_clusters(dbscan_res, out_file, operon_pval = None, subspec_pval = None):
-    OUT = open(out_file, mode='w')
-    OUT.write("clust_ID\tsize\toperon_pval\tsubspec_pval\tUniRef_ID\n")
-    for i in range(0, max(dbscan_res.values()) + 1 ):
-        OUT.write(str(i) + "\t")
-        ids = [k for k,v in dbscan_res.items() if v == i]
-        OUT.write(str(len(ids)) + "\t")
-        if operon_pval:
-            OUT.write(str(operon_pval[i]) + "\t")
-        else:
-            OUT.write("NA\t")
-        if subspec_pval:
-            OUT.write(str(subspec_pval["clust_" + str(i)]) + "\t")
-        else:
-            OUT.write("NA\t")
-        OUT.write(";".join(ids))
-        OUT.write("\n")
-    OUT.close()
-
-# ------------------------------------------------------------------------------
-#   PLOTTING HEATMAP WITH LABELS
-# ------------------------------------------------------------------------------
-
-def plot_heatmap(panphlan_matrix, out_path, clust_res = None  ):
-    import seaborn as sns
-    from matplotlib import pyplot as plt
-
-
-    sys.setrecursionlimit(10**7)
-
-    if clust_res is None:
-        plt.figure(figsize=(20,20))
-        p = sns.clustermap(data = panphlan_matrix, metric = 'jaccard',
-                        cmap = sns.color_palette(["#f7f7f7", "#ca0020"]),
-                        xticklabels = [], yticklabels = [],
-                        dendrogram_ratio=0.1)
-        p.cax.set_visible(False)
-        plt.axis('off')
-        p.savefig(out_path, dpi = 300)
-    else :
-
-        sns.set(color_codes = True, font_scale = 0.1)
-
-        clust_names = set(clust_res.values())
-        clust_to_col = dict(zip(clust_names, sns.color_palette("hls", len(clust_names) )))
-        # # identify cluster of core genome : biggest clust that is not -1 (noise points)
-        # table_count = {a : list(dbscan_res.values()).count(a) for a in dbscan_res.values()}
-        # table_count = sorted(table_count, key = table_count.get, reverse = True)
-        # if table_count[0] == -1 :
-        #     core_gene_clust = table_count[1]
-        # else:
-        #     core_gene_clust = table_count[0]
-        # # changing colors for core genes and rare genes
-        #clust_to_col.update({core_gene_clust : (0.2,0.2,0.2)})
-        clust_to_col.update({-1 : (1,1,1)})
-
-        lbl_to_col = {lbl : clust_to_col[clust] for lbl, clust in clust_res.items()  }
-        lbl_to_col = pd.Series(lbl_to_col)
-
-        #plot_width = round(panphlan_matrix.shape[0] / 50)
-        #plot_height = round(panphlan_matrix.shape[1] / 50)
-        #plt.figure()
-        p = sns.clustermap(data = panphlan_matrix,
-                        metric = 'jaccard',
-                        row_colors = lbl_to_col,
-                        cmap = sns.color_palette(["#f7f7f7", "#ca0020"]),
-                        xticklabels = [], yticklabels = [],
-                        dendrogram_ratio=0.1)
-        p.cax.set_visible(False)
-        p.savefig(out_path, dpi = 300)
-
-
-def plot_hm_genes_clusters(clust_PA_matrix, out_path, ssp_pval, args  ):
-    import seaborn as sns
-    from matplotlib import pyplot as plt
-
-    clust_PA_matrix = clust_PA_matrix.astype(int)
-    print(' [I] Exporting HM of clusters')
-    sys.setrecursionlimit(10**7)
-
-    sns.set(color_codes = True, font_scale = 0.1)
-
-    clust_names = ["clust_" + str(x) for x in set(ssp_pval.keys())]
-    annot_col_map = {True : "#f7f7f7", False : "#1a9641"}
-    lbl_to_col = dict()
-    for lbl, pval in ssp_pval.items():
-        col = annot_col_map[pval <= 0.01]
-        lbl_to_col.update({lbl : col})
-    lbl_to_col = pd.Series(lbl_to_col)
-
-    #plot_width = round(panphlan_matrix.shape[0] / 50)
-    #plot_height = round(panphlan_matrix.shape[1] / 50)
-    #plt.figure()
-    p = sns.clustermap(data = clust_PA_matrix,
-                    metric = 'jaccard',
-                    row_colors = lbl_to_col,
-                    cmap = sns.color_palette(["#f7f7f7", "#2c7bb6"]),
-                    xticklabels = [], yticklabels = [],
-                    dendrogram_ratio=0.1)
-    p.cax.set_visible(False)
-    p.savefig(out_path, dpi = 300)
-# ------------------------------------------------------------------------------
-#   ANALYZING GENES GROUPS - Proximity
-# ------------------------------------------------------------------------------
-
-def get_span_ratio(gene_list, df):
-    """Compute the ratio beetween sum of gene length and actual span of gene set on genome
-    In case of multiple genome and multiple values return the highest one
-    """
-    df = df[df['UniRef'].isin(gene_list)]
-    df = df.assign(length = lambda df : df["stop"] - df["start"])
-    sum_length = df["length"].sum()
-    tot_span = df["stop"].max() - df["start"].min()
-    return tot_span / sum_length
-
-
-def assessment_operon(clust_res, args):
-    """For each group of genes detected by dbscan clustering, compute its
-    spanning ratio (function above) and randomized spanning ratio of groups
-    with the same size. An empirical pvalue is thus computed.
-    """
-    if args.verbose:
-        print(' [I] Computing empirical pvalue for span of gene families clusters...')
-        print('     Generating ' + str(args.empirical) + ' empirical samples to compute the pvalue (arg --empirical, default 1000)')
-
-    clust_is_operon = dict()
-    table_count = {a : list(clust_res.values()).count(a) for a in clust_res.values()}
-    table_count = sorted(table_count, key = table_count.get, reverse = True)
-
-    pangenome_df = pd.read_csv(args.pangenome, sep = '\t', header = None)
-    pangenome_df.columns = ["UniRef", "name", "genome", "contig", "start", "stop"]
-
-    for cluster in table_count:
-        if args.verbose : print("Analysing cluster : {} ".format(cluster))
-        if cluster == table_count[0]:
-            clust_is_operon[cluster] = "NA"
-            continue
-        genes = [k for k,v in clust_res.items() if v == cluster]
-        cluster_info = pangenome_df[pangenome_df['UniRef'].isin(genes)]
-        contigs = cluster_info.contig.unique()
-        if args.verbose : print("Cluster span across {} contigs ".format(len(contigs)))
-        result_contig = []
-        for c in contigs:
-            if args.verbose : print("Analysing contig : {} ".format(c))
-            contig_info = cluster_info[cluster_info['contig'] == c]
-            gene_of_contig = contig_info['UniRef']
-            if len(gene_of_contig) < OPTICS_MIN_PTS:
-                if args.verbose : print("Contig discarded. Not enough genes on it")
-                result_contig.append(None)
-                continue
-            # assess genes span ratio
-            cluster_span_ratio = get_span_ratio(gene_of_contig, contig_info)
-            random_ratio = list()
-            for i in range(args.empirical):
-                contig_all_genes_info = pangenome_df[pangenome_df['contig'] == c]
-                random_set = random.sample(range(0, contig_all_genes_info.shape[0]), k=len(gene_of_contig))
-                random_set = list(contig_all_genes_info.iloc[random_set]["UniRef"])
-                random_ratio.append(get_span_ratio(random_set, contig_info))
-            # create sample of span values
-            pvalue = sum([a > cluster_span_ratio for a in random_ratio]) / len(random_ratio)
-            result_contig.append(pvalue)
-
-        result_contig = [x for x in result_contig if not x is None]
-
-        if len(result_contig) >= 1:
-            clust_is_operon.update({cluster : min(result_contig)})
-        else:
-            clust_is_operon.update({cluster : "NA"})
-
-    return clust_is_operon
-    
-# ------------------------------------------------------------------------------
-#   ANALYZING GENES GROUPS - Phylogeny
-# ------------------------------------------------------------------------------
-
-
-def create_cluster_PA_matrix(clust_res, panphlan_matrix):
-    """
-    Collapse the presence-absence signature of genes families to genes clusters level
-    Gene cluster is considered present if more than half of the genes are present
-    """
-    clust_res.index = panphlan_matrix.index
-    clust_IDs = set(clust_res[0].values)
-    clust_IDs.remove(-1)
-    clust_PA_matrix = pd.DataFrame(index = ["clust_" + str(x) for x in clust_IDs], 
-                                    columns = panphlan_matrix.columns )
-    for clust in clust_IDs :
-        uniref_in_clust = clust_res[clust_res[0] == clust].index
-        pres_abs_submat = panphlan_matrix.loc[uniref_in_clust]
-        clust_PA = pres_abs_submat.apply(lambda x : int(sum(x) > 0.5 * len(uniref_in_clust))).T
-        clust_PA_matrix.loc["clust_" + str(clust)] = clust_PA
-    return clust_PA_matrix
-
-
-def assessment_cluster_phylo_link(cluster, clust_PA_matrix, samples_dist, args):
-    if args.verbose:
-        print(' [I] Analyzing genes OPTICS cluster {}'.format(cluster))
-    grp_present = clust_PA_matrix.columns[clust_PA_matrix.loc[str(cluster)] == 1]
-    dist_grp_present = samples_dist.loc[samples_dist['sampleA'].isin(grp_present) & samples_dist['sampleB'].isin(grp_present)]['dist']
-    dist_inter_grp = samples_dist.loc[~samples_dist['sampleA'].isin(grp_present) ^ ~samples_dist['sampleB'].isin(grp_present)]['dist']
-    #mean_base = dist_inter_grp.mean() - dist_grp_present.mean() 
-    mean_base = np.std(dist_inter_grp) - np.std(dist_grp_present)
-    mean_base = abs(mean_base)
-    
-    empirical_means = []
-    for i in range(1, args.empirical):
-        rand_samples = random.sample(set(clust_PA_matrix.columns), k = len(grp_present))
-        dist_rand_samples = samples_dist.loc[samples_dist['sampleA'].isin(rand_samples) & samples_dist['sampleB'].isin(rand_samples)]['dist']
-        dist_not_rand_samples = samples_dist.loc[samples_dist['sampleA'].isin(rand_samples) ^ samples_dist['sampleB'].isin(rand_samples)]['dist']
-        # here to avoid computing the mean over way too many values and having a narrow range of empirical value, a subset is extracted
-        dist_rand_samples = random.sample(list(dist_rand_samples), k = 300)
-        dist_not_rand_samples = random.sample(list(dist_not_rand_samples), k = 300)
-        # empirical_means.append(dist_rand_samples.mean())
-        empirical_means.append(abs(np.mean(dist_not_rand_samples) - np.mean(dist_rand_samples)))
-        
-    clust_pval = sum([mean_base < x for x in empirical_means]) / args.empirical
-    print("Cluster {} has pval {}".format(cluster, clust_pval))
-    print("Mean Base {} ".format(mean_base))
-    
-    return([cluster, clust_pval])
-
-
-# TEST WITH MEDOID
-def assessment_cluster_phylo_link_medoid(cluster, clust_PA_matrix, sample_dist_matrix, args):
-    if args.verbose:
-        print(' [I] Analyzing genes OPTICS cluster {}'.format(cluster))
-    grp_present = clust_PA_matrix.columns[clust_PA_matrix.loc[str(cluster)] == 1]
-    
-    tmp = sample_dist_matrix[grp_present]
-    tmp = tmp.T[grp_present]
-    medoid_index = np.argmin(tmp.sum(axis=0))
-    mean_dist_to_medoid = tmp.iloc[medoid_index].mean()
-    empirical_means = []
-    for i in range(1, args.empirical):
-        rand_samples = random.sample(set(clust_PA_matrix.columns), k = len(grp_present ))
-        tmp = sample_dist_matrix[rand_samples]
-        tmp = tmp.T[rand_samples]
-        medoid_index_emp = np.argmin(tmp.sum(axis=0))
-        empirical_means.append(tmp.iloc[medoid_index_emp].mean())
-        
-    clust_pval = sum([mean_dist_to_medoid > x for x in empirical_means]) / args.empirical
-    print("Cluster {} has pval {}".format(cluster, clust_pval))
-    print("Mean Base {} ".format(mean_dist_to_medoid))
-    return([cluster, clust_pval])
-
-
-def assessment_subspecies_gene(clust_res, panphlan_matrix, clust_PA_matrix, args):
-    """WIP
-    Check if group of genes is linked to the phylogenetic dendrogramm
-    Check if distance between samples is significantly different between
-    the samples which have the gene cluster (or not) and the others.
-    Use empirical pvalue (random samples)
-    WIP
-    """
-    clust_res.index = panphlan_matrix.index
-    clust_IDs = set(clust_res[0].values)
-    clust_IDs.remove(-1)
-    
-    # then ompute pairwise distances of samples based on PanPhlAn presence_absence
-    # Shown to be correlated with StrainPhlAn distances
-    samples_dist = pd.DataFrame(itertools.combinations(panphlan_matrix.columns, 2),
-                                columns=['sampleA','sampleB'])
-    samples_dist['dist'] = pdist(panphlan_matrix.T, 'jaccard')
-    sample_dist_matrix = pd.DataFrame(squareform(samples_dist['dist'] ),
-                                index = panphlan_matrix.columns,
-                                columns = panphlan_matrix.columns)
-    
-    clust_phylo_test = dict()
-    
-    par_res = Parallel(n_jobs=args.n_jobs)(
-                delayed(assessment_cluster_phylo_link)(i, clust_PA_matrix, samples_dist, args) for i in clust_PA_matrix.index)
-    
-    for x,y in par_res :
-        clust_phylo_test.update({x : y})    
-    
-    # stick with bonferroni ? or BH ?
-    clust_phylo_test_df = pd.DataFrame.from_dict(clust_phylo_test, orient='index', columns = ['raw_pval'])
-    clust_phylo_test_df['pval_bonf'] = multipletests(clust_phylo_test_df['raw_pval'], method = 'bonferroni')[1]
-    clust_phylo_test_bonf = dict()
-    for index, row in clust_phylo_test_df.iterrows():
-        clust_phylo_test_bonf.update({index : row['pval_bonf']})
-
-    
-    # return clust_phylo_test_bonf
-    return clust_phylo_test
-
-# ------------------------------------------------------------------------------
-#   MAIN
-# ------------------------------------------------------------------------------
-def main():
-    if not sys.version_info.major == 3:
-        sys.stderr.write('[E] Python version: ' + sys.version)
-        sys.exit('[E] This software uses Python 3, please update Python')
-    args = read_params()
-
-    panphlan_matrix = read_and_filter_matrix(args.i_matrix, args.cut_core_thres, args.cut_cloud_thres, args.verbose)
-
-    if args.output:
-        dist_matrix = compute_dist(panphlan_matrix, args.verbose)
-
-        optics_res_dict, optics_res_df = process_OPTICS(dist_matrix, args.optics_xi, args.n_jobs, args.verbose)
-        if args.close_analysis:
-            operon_pval = assessment_operon(optics_res_dict, args)
-        else:
-            operon_pval = None
-        if args.ssp_analysis:
-            clust_PA_matrix = create_cluster_PA_matrix(optics_res_df, panphlan_matrix)
-            ssp_pval = assessment_subspecies_gene(optics_res_df, panphlan_matrix, clust_PA_matrix, args)
-            # with open('ssp_pval.pkl', 'wb') as out_pkl:
-            #     pickle.dump(ssp_pval, out_pkl)
-    
-            # with open('ssp_pval.pkl', 'rb') as in_pkl:
-            #     ssp_pval = pickle.load(in_pkl)     
-            # del ssp_pval[-1]
-            # with open('PA_clust.pkl', 'wb') as out_pkl:
-            #     pickle.dump(clust_PA_matrix, out_pkl)
-            
-        else :
-            ssp_pval = None
-            
-        # print(clust_PA_matrix)    
-        write_clusters(optics_res_dict, args.output, operon_pval = operon_pval, subspec_pval = ssp_pval)
-        if args.out_plot:
-            plot_heatmap(panphlan_matrix, args.out_plot,  optics_res_dict)
-        if args.plot_clust_pa:
-            plot_hm_genes_clusters(clust_PA_matrix, args.plot_clust_pa, ssp_pval, args )
-    elif args.out_plot:
-        plot_heatmap(panphlan_matrix, args.out_plot)
-
-
-
-if __name__ == '__main__':
-    start_time = time.time()
-    main()
-    mins_elapsed = round((time.time() - start_time) / 60.0, 2)
-    print('[TERMINATING...] ' + __file__ + ', ' + str(mins_elapsed) + ' minutes.')