From da147a1851895cba65a72ee6c8458f4ba7544cd0 Mon Sep 17 00:00:00 2001 From: Rauf Salamzade Date: Wed, 28 Aug 2024 16:00:39 -0500 Subject: [PATCH] update to v1.1.2 --- CITATION.cff | 2 +- bin/lsaBGC-Cluster | 2 +- bin/lsaBGC-ComprehenSeeIve | 2 +- bin/lsaBGC-MIBiGMapper | 2 +- bin/lsaBGC-Reconcile | 2 +- bin/lsaBGC-See | 2 +- bin/lsaBGC-Sociate | 2 +- src/lsaBGC/classes/BGC.py | 69 +----- src/lsaBGC/classes/GCF.py | 104 +++++--- src/lsaBGC/classes/Pan.py | 106 ++++---- src/lsaBGC/util.py | 496 +++++++++++++++++++++++++++++++------ workflows/lsaBGC-Pan | 2 +- 12 files changed, 563 insertions(+), 228 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 027caa4..453b983 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -9,5 +9,5 @@ authors: orcid: "https://orcid.org/0000-0003-4980-5128" title: "lsaBGC-Pan" version: 1.1.2 -date-released: 2024-08-21 +date-released: 2024-08-28 url: "https://github.com/Kalan-Lab/lsaBGC-Pan" diff --git a/bin/lsaBGC-Cluster b/bin/lsaBGC-Cluster index 96db86a..f0afcc4 100755 --- a/bin/lsaBGC-Cluster +++ b/bin/lsaBGC-Cluster @@ -114,7 +114,7 @@ def lsaBGC_Cluster(): # create logging object log_file = outdir + 'Progress.log' logObject = util.createLoggerObject(log_file) - version_string = util.parseVersionFromSetupPy() + version_string = util.parseVersion() logObject.info('Running lsaBGC version %s' % version_string) # Step 0: Log input arguments and update reference and query FASTA files. diff --git a/bin/lsaBGC-ComprehenSeeIve b/bin/lsaBGC-ComprehenSeeIve index be9213e..4353b2a 100644 --- a/bin/lsaBGC-ComprehenSeeIve +++ b/bin/lsaBGC-ComprehenSeeIve @@ -118,7 +118,7 @@ def lsaBGC_ComprehenSeeIve(): # create logging object log_file = outdir + 'Progress.log' logObject = util.createLoggerObject(log_file) - version_string = util.parseVersionFromSetupPy() + version_string = util.parseVersion() logObject.info('Running lsaBGC version %s' % version_string) # Log input arguments and update reference and query FASTA files. diff --git a/bin/lsaBGC-MIBiGMapper b/bin/lsaBGC-MIBiGMapper index 19b50ae..8862c8f 100644 --- a/bin/lsaBGC-MIBiGMapper +++ b/bin/lsaBGC-MIBiGMapper @@ -125,7 +125,7 @@ def mapToMIBiG(): # create logging object log_file = outdir + 'Progress.log' logObject = util.createLoggerObject(log_file) - version_string = util.parseVersionFromSetupPy() + version_string = util.parseVersion() logObject.info('Running lsaBGC version %s' % version_string) diff --git a/bin/lsaBGC-Reconcile b/bin/lsaBGC-Reconcile index 25d0222..4a06942 100644 --- a/bin/lsaBGC-Reconcile +++ b/bin/lsaBGC-Reconcile @@ -125,7 +125,7 @@ def lsaBGC_Reconcile(): # create logging object log_file = outdir + 'Progress.log' logObject = util.createLoggerObject(log_file) - version_string = util.parseVersionFromSetupPy() + version_string = util.parseVersion() logObject.info('Running lsaBGC version %s' % version_string) # Log input arguments and update reference and query FASTA files. diff --git a/bin/lsaBGC-See b/bin/lsaBGC-See index 0157275..de772ea 100755 --- a/bin/lsaBGC-See +++ b/bin/lsaBGC-See @@ -111,7 +111,7 @@ def lsaBGC_See(): # create logging object log_file = outdir + 'Progress.log' logObject = util.createLoggerObject(log_file) - version_string = util.parseVersionFromSetupPy() + version_string = util.parseVersion() logObject.info('Running lsaBGC version %s' % version_string) # Log input arguments and update reference and query FASTA files. 
diff --git a/bin/lsaBGC-Sociate b/bin/lsaBGC-Sociate index daec7ae..61b72ab 100644 --- a/bin/lsaBGC-Sociate +++ b/bin/lsaBGC-Sociate @@ -129,7 +129,7 @@ def lsaBGC_Sociate(): # create logging object log_file = outdir + 'Progress.log' logObject = util.createLoggerObject(log_file) - version_string = util.parseVersionFromSetupPy() + version_string = util.parseVersion() logObject.info('Running lsaBGC version %s' % version_string) # Log input arguments and update reference and query FASTA files. diff --git a/src/lsaBGC/classes/BGC.py b/src/lsaBGC/classes/BGC.py index 5df08ff..a471e0f 100755 --- a/src/lsaBGC/classes/BGC.py +++ b/src/lsaBGC/classes/BGC.py @@ -163,7 +163,7 @@ def parseGECCO(self, comprehensive_parsing=True, flank_size=2000): self.cluster_information = bgc_info def parseAntiSMASH(self, comprehensive_parsing=True, flank_size=2000): - """ Functoin to parse BGC Genbank produced by antiSMASH """ + """ Function to parse BGC Genbank produced by antiSMASH """ bgc_info = [] domains = [] core_positions = set([]) @@ -325,69 +325,4 @@ def parseGenbanks(self, comprehensive_parsing=True, flank_size=2000): elif self.prediction_method.upper() == 'GECCO': self.parseGECCO(comprehensive_parsing=comprehensive_parsing, flank_size=flank_size) else: - raise RuntimeError("Unable to parse file because BGC prediction method is not an accepted option!") - - def refineGenbank(self, refined_genbank_file, first_bg, second_bg): - """ - Function to prune and update coordinates of BGC Genbank - main functoin of lsaBGC-Refiner - """ - try: - rgf_handle = open(refined_genbank_file, 'w') - start_coord = min([self.gene_information[first_bg]['start'], self.gene_information[first_bg]['end'], self.gene_information[second_bg]['start'], self.gene_information[second_bg]['end']]) - end_coord = max([self.gene_information[first_bg]['start'], self.gene_information[first_bg]['end'], self.gene_information[second_bg]['start'], self.gene_information[second_bg]['end']]) - pruned_coords = set(range(start_coord, end_coord+1)) - with open(self.bgc_genbank) as ogbk: - recs = [x for x in SeqIO.parse(ogbk, 'genbank')] - try: - assert(len(recs) == 1) - except Exception as e: - raise RuntimeError(traceback.format_exc()) - for rec in recs: - original_seq = str(rec.seq) - filtered_seq = "" - if end_coord == len(original_seq): - filtered_seq = original_seq[start_coord-1:] - else: - filtered_seq = original_seq[start_coord-1:end_coord] - - new_seq_object = Seq(filtered_seq) - - updated_rec = copy.deepcopy(rec) - updated_rec.seq = new_seq_object - - updated_features = [] - for feature in rec.features: - all_coords, start, end, direction, is_multi_part = util.parseCDSCoord(str(feature.location)) - - feature_coords = set(range(start, end+1)) - if len(feature_coords.intersection(pruned_coords)) > 0: - fls = [] - for sc, ec, dc in all_coords: - updated_start = sc - start_coord + 1 - updated_end = ec - start_coord + 1 - if ec > end_coord: - if feature.type == 'CDS': - continue - else: - updated_end = end_coord - start_coord + 1 # ; flag1 = True - if sc < start_coord: - if feature.type == 'CDS': - continue - else: - updated_start = 1 # ; flag2 = True - - strand = 1 - if dc == '-': - strand = -1 - fls.append(FeatureLocation(updated_start - 1, updated_end, strand=strand)) - if len(fls) > 0: - updated_location = fls[0] - if len(fls) > 1: - updated_location = sum(fls) - feature.location = updated_location - updated_features.append(feature) - updated_rec.features = updated_features - SeqIO.write(updated_rec, rgf_handle, 'genbank') - rgf_handle.close() - 
except Exception as e: - raise RuntimeError(traceback.format_exc()) \ No newline at end of file + raise RuntimeError("Unable to parse file because BGC prediction method is not an accepted option!") \ No newline at end of file diff --git a/src/lsaBGC/classes/GCF.py b/src/lsaBGC/classes/GCF.py index 94c4407..24f2875 100755 --- a/src/lsaBGC/classes/GCF.py +++ b/src/lsaBGC/classes/GCF.py @@ -50,6 +50,11 @@ def __init__(self, bgc_genbanks_listing, gcf_id='GCF_X', logObject=None, lineage self.avoid_samples = set([]) def identifyKeyHomologGroups(self, all_primary=False): + """ + Function that is not used currently in lsaBGC-Pan - in the original lsaBGC - this function + computed which orthogroups were core to a BGC, which were single-copy core and which were + commonly found in BGC protocore regions. This is a little more tricky to integrate with zol. + """ try: initial_samples_with_at_least_one_gcf_hg = set([]) for hg in self.hg_genes: @@ -96,7 +101,7 @@ def identifyKeyHomologGroups(self, all_primary=False): def aggregateProteins(self, gcf_prots_file, draft_mode=False): """ - function to aggregate protein sequences from BGC genbanks and output to a file in FASTA format. + Function to aggregate protein sequences from BGC genbanks and output to a file in FASTA format. """ try: gpf_handle = open(gcf_prots_file, 'w') @@ -118,11 +123,14 @@ def aggregateProteins(self, gcf_prots_file, draft_mode=False): def modifyPhylogenyForSamplesWithMultipleBGCs(self, input_phylogeny, result_phylogeny, prune_set=None): """ + Definition: Function which takes in an input phylogeny and produces a replicate resulting phylogeny with samples/leafs which have multiple BGC instances for a GCF expanded. - - :param input_phylogeny: input newick phylogeny file - :result result_phylogeny: resulting newick phylogeny file + ******************************************************************************************************************** + Parameters: + - input_phylogeny: The input newick phylogeny file. + - result_phylogeny: The resulting newick phylogeny file to create. + ******************************************************************************************************************** """ try: number_of_added_leaves = 0 @@ -155,11 +163,13 @@ def modifyPhylogenyForSamplesWithMultipleBGCs(self, input_phylogeny, result_phyl def assignColorsToHGs(self, gene_to_hg, bgc_genes, outdir): """ + Description: Simple function to associate each homolog group with a color for consistent coloring. - - :param gene_to_hg: gene to HG relationship. - :param bgc_genes: set of genes per HG. - :return: dictionary mapping each HG to a hex color value. + ******************************************************************************************************************** + Parameters: + - gene_to_hg: gene to HG relationship. + - bgc_genes: set of genes per HG. + ******************************************************************************************************************** """ hg_bgc_counts = defaultdict(int) @@ -190,10 +200,14 @@ def assignColorsToHGs(self, gene_to_hg, bgc_genes, outdir): def createItolBGCSeeTrack(self, result_track_file): """ + Description: Function to create a track file for visualizing BGC gene architecture across a phylogeny in the interactive tree of life (iTol) - - :param result_track_file: The path to the resulting iTol track file for BGC gene visualization. 
+ ******************************************************************************************************************** + Parameters: + - self: GCF object + - result_track_file: The path to the resulting iTol track file for BGC gene visualization. + ******************************************************************************************************************** """ try: track_handle = open(result_track_file, 'w') @@ -286,14 +300,18 @@ def createItolBGCSeeTrack(self, result_track_file): def visualizeComprehenSeeIveGCFViaR(self, orthofinder_matrix_file, phylogeny_newick_file, heatmap_track_file, detection_track_file, result_pdf_file, plot_rscript): """ + Definition: Function to create tracks for visualization of gene architecture of BGCs belonging to GCF and run Rscript bgSee.R to produce automatic PDFs of plots. In addition, bgSee.R also produces a heatmap to more easily identify homolog groups which are conserved across isolates found to feature GCF. - - :param gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!) - :param heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!) - :param phylogeny_file: Phylogeny to use for visualization. - :param result_pdf_file: Path to PDF file where plots from bgSee.R will be written to. + ******************************************************************************************************************** + Parameters: + - self: GCF object + - gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!) + - heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!) + - phylogeny_file: Phylogeny to use for visualization. + - result_pdf_file: Path to PDF file where plots from bgSee.R will be written to. + ******************************************************************************************************************** """ try: if os.path.isfile(heatmap_track_file) or os.path.isfile(detection_track_file): @@ -422,14 +440,20 @@ def visualizeComprehenSeeIveGCFViaR(self, orthofinder_matrix_file, phylogeny_new def visualizeGCFViaR(self, gggenes_track_file, heatmap_track_file, phylogeny_file, result_pdf_file, plot_rscript): """ + Definition: Function to create tracks for visualization of gene architecture of BGCs belonging to GCF and run Rscript bgSee.R to produce automatic PDFs of plots. In addition, bgSee.R also produces a heatmap to more easily identify homolog groups which are conserved across isolates found to feature GCF. - - :param gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!) - :param heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!) - :param phylogeny_file: Phylogeny to use for visualization. - :param result_pdf_file: Path to PDF file where plots from bgSee.R will be written to. + ******************************************************************************************************************** + Parameters: + - self: GCF object. + - gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it + doesn't exist!) 
+ - heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it + doesn't exist!)- + - phylogeny_file: Phylogeny to use for visualization. + - result_pdf_file: Path to PDF file where plots from bgSee.R will be written to. + ******************************************************************************************************************** """ try: if os.path.isfile(gggenes_track_file) or os.path.isfile(heatmap_track_file): @@ -599,18 +623,22 @@ def visualizeGCFViaR(self, gggenes_track_file, heatmap_track_file, phylogeny_fil def constructCodonAlignments(self, outdir, threads=1, only_scc=False, list_alignments=False, filter_outliers=False, use_ms5=True): """ + Definition: Function to automate construction of codon alignments. This function first extracts protein and nucleotide sequnces from BGC Genbanks, then creates protein alignments for each homolog group using MAFFT, and finally converts those into codon alignments using PAL2NAL. - - :param outdir: Path to output/workspace directory. Intermediate files (like extracted nucleotide and protein - sequences, protein and codon alignments, will be writen to respective subdirectories underneath this - one). - :param threads: Number of threads/threads to use when fake-parallelizing jobs using multiprocessing. - :param only_scc: Whether to construct codon alignments only for homolog groups which are found to be core and in - single copy for samples with the GCF. Note, if working with draft genomes and the BGC is fragmented - this should be able to still identify SCC homolog groups across the BGC instances belonging to the - GCF. + ******************************************************************************************************************** + Parameters: + - self: GCF object. + - outdir: Path to output/workspace directory. Intermediate files (like extracted nucleotide and protein + sequences, protein and codon alignments, will be writen to respective subdirectories underneath this + one). + - threads: Number of threads/threads to use when fake-parallelizing jobs using multiprocessing. + - only_scc: Whether to construct codon alignments only for homolog groups which are found to be core and in + single copy for samples with the GCF. Note, if working with draft genomes and the BGC is fragmented + this should be able to still identify SCC homolog groups across the BGC instances belonging to the + GCF. + ******************************************************************************************************************** """ nucl_seq_dir = os.path.abspath(outdir) + '/Nucleotide_Sequences/' @@ -695,11 +723,15 @@ def constructCodonAlignments(self, outdir, threads=1, only_scc=False, list_align def constructGCFPhylogeny(self, output_alignment, output_phylogeny, only_scc=False, ambiguious_position_cutoff=0.000001): """ + Definition: Function to create phylogeny based on codon alignments of SCC homolog groups for GCF. - - :param output_alignment: Path to output file for concatenated SCC homolog group alignment. - :param output_phylogeny: Path to output file for approximate maximum-likelihood phylogeny produced by FastTree2 from - concatenated SCC homolog group alignment. + ******************************************************************************************************************** + Parameters: + - self: GCF object. + - output_alignment: Path to output file for concatenated SCC homolog group alignment. 
+ - output_phylogeny: Path to output file for approximate maximum-likelihood phylogeny produced by FastTree2 from + concatenated SCC homolog group alignment. + ******************************************************************************************************************** """ try: if only_scc: @@ -796,9 +828,13 @@ def constructGCFPhylogeny(self, output_alignment, output_phylogeny, only_scc=Fal def create_codon_msas(inputs): """ + Definition: Helper function which is to be called from the constructCodonAlignments() function to parallelize construction of codon alignments for each homolog group of interest in the GCF. - :param inputs: list of inputs passed in by GCF.constructCodonAlignments(). + ******************************************************************************************************************** + Parameters: + - inputs: list of inputs passed in by GCF.constructCodonAlignments(). + ******************************************************************************************************************** """ hg, gene_sequences, nucl_seq_dir, prot_seq_dir, prot_alg_dir, codo_alg_dir, threads, use_ms5, logObject = inputs diff --git a/src/lsaBGC/classes/Pan.py b/src/lsaBGC/classes/Pan.py index ce4c8ae..063fa6e 100755 --- a/src/lsaBGC/classes/Pan.py +++ b/src/lsaBGC/classes/Pan.py @@ -64,9 +64,17 @@ def __init__(self, bgc_genbanks_listing, logObject=None, lineage_name='Unnamed l def readInBGCGenbanks(self, comprehensive_parsing=True, prune_set=None, edge_dist_cutoff=5000): """ + Definition: Function to parse file listing location of BGC Genbanks. - - :param comprehensive_parsing (optional): flag specifying whether to perform comprehensive extraction of information from Genbanks. default is True. + ******************************************************************************************************************** + Parameters: + - self: A Pan, GCF, or BGC object. + - comprehensive_parsing (optional): flag specifying whether to perform comprehensive extraction of information from + Genbanks. default is True. + - prune_set (optional): set of samples to prune out and disregard BGCs from. default is None. + - edge_dist_cutoff (optional): the distance from scaffold/contig edge for a BGC to be regarded as incomplete. default + is 5000 + ******************************************************************************************************************** """ sample_index = defaultdict(int) with open(self.bgc_genbanks_listing) as obsf: @@ -128,14 +136,18 @@ def readInBGCGenbanks(self, comprehensive_parsing=True, prune_set=None, edge_dis def inputHomologyInformation(self, gene_to_hg, hg_genes, hg_median_copy_count, hg_prop_multi_copy): """ + Definition: Simple function to store OrthoFinder homology information (parsed inititally in lsaBGC.utils) - - :param gene_to_hg: dictionary mapping gene locus tags to homolog group identifiers - :param hg_genes: dictionary of sets with keys corresponding to homolog groups and values corresponding to set - of gene locus tags belonging to that homolog group. - :param hg_median_copy_count: dictionary for the median copy count of each homolog group - :param hg_prop_multi_copy: dictionary for the proportion of samples with homolog group which have multipmle - genes assigned to homolog group (paralogs). + ******************************************************************************************************************** + Parameters: + - self: A Pan, GCF, or BGC object. 
+ - gene_to_hg: dictionary mapping gene locus tags to homolog group identifiers + - hg_genes: dictionary of sets with keys corresponding to homolog groups and values corresponding to set + of gene locus tags belonging to that homolog group. + - hg_median_copy_count: dictionary for the median copy count of each homolog group + - hg_prop_multi_copy: dictionary for the proportion of samples with homolog group which have multiple + genes assigned to homolog group (paralogs). + ******************************************************************************************************************** """ try: self.gene_to_hg = dict(gene_to_hg) @@ -159,10 +171,14 @@ def inputHomologyInformation(self, gene_to_hg, hg_genes, hg_median_copy_count, h def openStatsFile(self, outdir, run_parameter_tests=False): """ + Definition: Simple function to initialize final report file with statistics on GCF clustering - main output from lsaBGC-Cluster. - - :param outdir: path to workspace directory - :param run_parameter_tests: whether lsaBGC-Cluster is being run in "run_parameter_tests mode". + ******************************************************************************************************************** + Parameters: + - self: A Pan, GCF or BGC object. + - outdir: path to workspace directory + - run_parameter_tests: whether lsaBGC-Cluster is being run in "run_parameter_tests mode". + ******************************************************************************************************************** """ try: final_stats_file = outdir + 'GCF_Details.txt' @@ -282,13 +298,17 @@ def calculateBGCPairwiseRelations(self, outdir, split_by_annotation=False): def runMCLAndReportGCFs(self, mip, jcp, sccp, ccp, outdir, run_parameter_tests=False, threads=1): """ + Description: Function to run MCL and report the GCFs (gene-cluster families) of homologous BGCs identified. - - :param mip: MCL inflation parameter. - :param jcp: Jaccard similarity threshold for homology between two BGCs to be considered. - :param outdir: path to workspace directory. - :param run_parameter_tests: True - :param threads: number of threads/threads to use for MCL. + ******************************************************************************************************************** + Parameters: + - self: A Pan, GCF or BGC object. + - mip: MCL inflation parameter. + - jcp: Jaccard similarity threshold for homology between two BGCs to be considered. + - outdir: path to workspace directory. + - run_parameter_tests: whether lsaBGC-Cluster is being run in "run_parameter_tests mode". + - threads: number of threads to use for MCL. + ******************************************************************************************************************** """ pair_relations_filt_txt_file = outdir + 'bgc_pair_relationships.txt' try: @@ -514,11 +534,15 @@ def runMCLAndReportGCFs(self, mip, jcp, sccp, ccp, outdir, run_parameter_tests=F def plotResultsFromUsingDifferentParameters(self, outdir): """ + Description: Function to create an 8x11 in PDF with figures aimed to give the user an idea on how different values of the MCL inflation parameter and the Jaccard similarity cutoff might impact clustering to help them choose the best parameter values. - - :param outdir: path to workspace directory.
+ ******************************************************************************************************************** """ try: plot_input_dir = outdir + 'plotting_input/' @@ -762,6 +786,21 @@ def convertGenbanksIntoFastas(self, fasta_dir, fasta_listing_file, sample_retent raise RuntimeError(traceback.format_exc()) def createParamInfluencePlot(self, workspace, plot_input_1_file, plot_annot_file, plot_input_2_file, plot_overview_file, plot_input_3_file, plot_pdf_file): + """ + Description: + Function to create the parameter impact assessment PDF report for lsaBGC-Cluster. + ******************************************************************************************************************** + Parameters: + - self: A Pan, GCF or BGC object. + - workspace: The output/workspace where to generate R scripts and other intermediate files. + - plot_input_1_file: The input table + - plot_annot_file: The input table used to create the heatmap visuals on the second page of the PDF report. + - plot_input_2_file: The input table used to assess homogeneity in product/BGC-types per GCF clustering. + - plot_overview_file: The input table with overview information on clustering using a variety of parameters. + - plot_input_3_file: The input table with information on copy-count of GCFs per sample. + - plot_pdf_file: The resulting path to the report PDF file. + ******************************************************************************************************************** + """ rscript_file = workspace + 'params_impact_rscript.R' rfh = open(rscript_file, 'w') @@ -878,30 +917,3 @@ def createParamInfluencePlot(self, workspace, plot_input_1_file, plot_annot_file self.logObject.error('Had an issue running: %s' % ' '.join(rscript_plot_cmd)) self.logObject.error(traceback.format_exc()) raise RuntimeError('Had an issue running: %s' % ' '.join(rscript_plot_cmd)) - - - def readInPopulationsSpecification(self, pop_specs_file, prune_set=None): - """ - Read in population specifications per sample and assign population to each BGC. - - :param pop_specs_file: path to file which has tab separated sample (1st column) and corresponding population (2nd column) - """ - try: - self.bgc_population = defaultdict(lambda: "NA") - self.sample_population = defaultdict(lambda: "NA") - with open(pop_specs_file) as opsf: - for i, line in enumerate(opsf): - if i == 0 and line.startswith('name'): continue - line = line.strip() - sample, population = line.split('\t') - if prune_set != None and not sample in prune_set: continue - self.sample_population[sample] = population - for bgc in self.sample_bgcs[sample]: - self.bgc_population[bgc] = population - self.logObject.info("Successfully parsed population specifications file. There are %d populations." % len(population)) - except Exception as e: - if self.logObject: - self.logObject.error("Issue in parsing population specifications per sample and associating with each BGC.") - self.logObject.error(traceback.format_exc()) - raise RuntimeError(traceback.format_exc()) - diff --git a/src/lsaBGC/util.py b/src/lsaBGC/util.py index 975314e..8171542 100755 --- a/src/lsaBGC/util.py +++ b/src/lsaBGC/util.py @@ -23,6 +23,19 @@ valid_alleles = set(['A', 'C', 'G', 'T']) def reformatOrthologInfo(ortholog_matrix_file, zol_results_dir, logObject): + """ + Description: + Function to reformat sample by ortholog group matrix to a format that can be fed into zol directly as precomputed. 
+ ******************************************************************************************************************** + Parameters: + - ortholog_matrix_file: Path to sample by ortholog matrix file. + - zol_results_dir: The workspace where to write the resulting re-formatted ortholog info file. + - logObject: A python logging object + ******************************************************************************************************************** + Returns: + - ortholog_listing_file: Path to the reformatted ortholog information file in table format. + ******************************************************************************************************************** + """ ortholog_listing_file = zol_results_dir + 'LocusTag_to_Ortholog_Relations.txt' try: outf = open(ortholog_listing_file, 'w') @@ -46,6 +59,23 @@ def reformatOrthologInfo(ortholog_matrix_file, zol_results_dir, logObject): sys.exit(1) def determineNonRepBGCs(sample, gcf, sample_gcf_bgcs, gcf_bgcs, bgc_pairwise_relations, edgy_bgcs, logObject): + """ + Description: + Determine which BGC instances to retain for zol analysis for sample with multiple copies of a single GCF. + ******************************************************************************************************************** + Parameters: + - sample: sample name + - gcf: GCF identifier + - sample_gcf_bgcs: The set of all BGC instances for the sample-GCF pairing. + - gcf_bgcs: The set of all BGC instances for the GCF. + - bgc_pairwise_relations: A dictionary with sequence differences between pairs of BGCs. + - edgy_bgcs: The set of BGCs which are near scaffold/contig edges. + - logObject: A python logging object + ******************************************************************************************************************** + Returns: + - nonrep_bgcs: The set of non-representative BGCs for the sample-GCF pairing determined. + ******************************************************************************************************************** + """ try: complete_bgcs = [] for sample_bgc in sample_gcf_bgcs: @@ -98,6 +128,21 @@ def determineNonRepBGCs(sample, gcf, sample_gcf_bgcs, gcf_bgcs, bgc_pairwise_rel def runMIBiGMapper(detailed_BGC_listing_with_Pop_and_GCF_map_file, ortholog_matrix_file, mibig_dir, multi_thread, parallel_jobs_4thread, logObject): + """ + Description: + Void wrapper function to run lsaBGC-MIBiGMapper for all GCFs in parallel. + ******************************************************************************************************************** + Parameters: + - detailed_BGC_listing_with_pop_and_GCF_map_file: The detailed BGC listing file produced in lsaBGC-Pan with much of + the information for sample mapping, GCF mapping, and context of + each BGC. + - ortholog_matrix_file: The sample by ortholog matrix. + - mibig_dir: The results directory where to write lsaBGC-MIBiGMapper results to. + - multi_threads: The number of threads per job. + - parallel_jobs_4thread: The number of parallel jobs to run whereby each is assumed to be using 4 threads. 
+ - logObject: A python logging object + ******************************************************************************************************************** + """ try: gcfs = set([]) with open(detailed_BGC_listing_with_Pop_and_GCF_map_file) as odbl: @@ -132,6 +177,22 @@ def runMIBiGMapper(detailed_BGC_listing_with_Pop_and_GCF_map_file, ortholog_matr sys.exit(1) def runSeeAndComprehenSeeIve(detailed_BGC_listing_with_Pop_and_GCF_map_file, species_tree, ortholog_matrix_file, see_dir, compsee_dir, threads, logObject): + """ + Description: + Void wrapper function to run lsaBGC-See and lsaBGC-ComprehenSeeIve for all GCFs in parallel. + ******************************************************************************************************************** + Parameters: + - detailed_BGC_listing_with_pop_and_GCF_map_file: The detailed BGC listing file produced in lsaBGC-Pan with much of + the information for sample mapping, GCF mapping, and context of + each BGC. + - species_tree: A species tree in Newick format. + - ortholog_matrix_file: The sample by ortholog matrix. + - see_dir: The results directory where to write lsaBGC-See results to. + - compsee_dir: The results directory where to write lsaBGC-ComprehenSeeIve results to. + - threads: The number of parallel jobs to run whereby each is assumed to be using 1 thread. + - logObject: A python logging object + ******************************************************************************************************************** + """ try: gcfs = set([]) with open(detailed_BGC_listing_with_Pop_and_GCF_map_file) as odbl: @@ -172,6 +233,18 @@ def runSeeAndComprehenSeeIve(detailed_BGC_listing_with_Pop_and_GCF_map_file, spe sys.exit(1) def computeConservationOfOGWithinGCFContext(inputs): + """ + Description: + Void function which computes the conservation of an orthogroup across BGC instances within a GCF. + ******************************************************************************************************************** + Parameters: + - inputs: A list of four items: + - bgc_paths: A list of BGC paths belonging to the focal GCF. + - og_listing_file: The locus tag to orthogroup mapping table. + - output_file: The output file which maps orthogroups to their conservation. + - logObject: A python logging object. + ******************************************************************************************************************** + """ bgc_paths, og_listing_file, output_file, logObject = inputs try: @@ -205,6 +278,30 @@ def computeConservationOfOGWithinGCFContext(inputs): sys.exit(1) def runZol(detailed_BGC_listing_with_Pop_and_GCF_map_file, ortholog_listing_file, pairwise_relations, zol_comp_results_dir, zol_full_results_dir, cgc_results_dir, zol_parameters, zol_high_quality_preset, zol_edge_distance, zol_keep_multi_copy, threads, multi_thread, parallel_jobs_4thread, logObject): + """ + Description: + Void wrapper function to run zol for all GCFs in parallel. + ******************************************************************************************************************** + Parameters: + - detailed_BGC_listing_with_pop_and_GCF_map_file: The detailed BGC listing file produced in lsaBGC-Pan with much of + the information for sample mapping, GCF mapping, and context of + each BGC. + - ortholog_listing_file: The locus tag to orthogroup mapping table. + - pairwise_relations: A dictionary with pairwise relations between pairs of BGCs + - zol_comp_results_dir: The results directory where to write zol comprehensive results to.
+ - zol_full_results_dir: The results directory where to write orthogroup conservations to based on full/complete BGC + instances belonging to a GCF. + - cgc_results_dir: The results directory where to write cgc results to. + - zol_parameters: The parameters with which to run zol analysis. + - zol_high_quality_preset: Flag for whether to run a more full/high-quality zol analysis. + - zol_edge_distance: The distance from the edge of a contig/scaffold for a BGC instance to be considered partial. + - zol_keep_multi_copy: Flag for whether to retain multi-copy instances of a GCF per sample. + - threads: The number of parallel jobs to run assuming each job uses 1 thread. + - multi_thread: The number of threads to use per multi-threaded job - typically hardcoded to 4. + - parallel_jobs_4thread: The number of parallel jobs to run assuming each job uses 4 threads. + - logObject: A python logging object. + ******************************************************************************************************************** + """ try: gcf_sample_bgcs = defaultdict(lambda: defaultdict(list)) edgy_bgcs = set([]) @@ -302,6 +399,17 @@ def runZol(detailed_BGC_listing_with_Pop_and_GCF_map_file, ortholog_listing_file def runCmdViaSubprocess(cmd, logObject=None, check_files=[], check_directories=[]): + """ + Description: + Void function to run a command using python's subprocess. + ******************************************************************************************************************** + Parameters: + - cmd: A list corresponding to a command. + - logObject [optional]: A python logging object. + - check_files [optional]: A list of files to simply check are made after running the command. + - check_directories [optional]: A list of directories to simply check are made after running the command. + ******************************************************************************************************************** + """ if logObject != None: logObject.info('Running %s' % ' '.join(cmd)) try: @@ -321,6 +429,16 @@ def runCmdViaSubprocess(cmd, logObject=None, check_files=[], check_directories=[ def mapColorsToCategories(categories_set, colors_file, colors_mapping_file): + """ + Description: + Void function to map colors to categories for various plotting purposes. + ******************************************************************************************************************** + Parameters: + - categories_set: Set of categories to have colors assigned to. + - colors_file: A file where each line corresponds to a hex code for a color. + - colors_mapping_file: The result file where the first column has a category and the second the mapped color. + ******************************************************************************************************************** + """ try: colors = [] with open(colors_file) as ocf: @@ -343,6 +461,18 @@ def mapColorsToCategories(categories_set, colors_file, colors_mapping_file): sys.exit(1) def pairwiseDistancesFromTree(tree_file, logObject): + """ + Description: + Function to compute the branch distance between leafs in a newick tree. + ******************************************************************************************************************** + Parameters: + - tree_file: The input tree file in Newick format. + - logObject: a python logging object + ******************************************************************************************************************** + Returns: + - pairwise_distances: A dictionary showing branch distances between pairs of leafs. 
+ ******************************************************************************************************************** + """ pairwise_distances = defaultdict(lambda: defaultdict(lambda: None)) try: try: @@ -373,6 +503,18 @@ def pairwiseDistancesFromTree(tree_file, logObject): sys.exit(1) def generateColors(workspace, outfile, color_count, palette='Spectral', palette_color_count=12): + """ + Description: + Void function to generate an output file with colors in hex code for various plotting purposes. + ******************************************************************************************************************** + Parameters: + - workspace: The workspace where to write temporary files and scripts. + - outfile: The output file where hex codes of colors will be written to. + - color_count: The count of desired colors. + - palette [Optional]: The color palette in R to use. + - palette_color_count [Optional]: The color palette's default color count. + ******************************************************************************************************************** + """ try: assert(os.path.isdir(workspace)) workspace = os.path.abspath(workspace) + '/' @@ -393,6 +535,14 @@ def generateColors(workspace, outfile, color_count, palette='Spectral', palette_ sys.exit(1) def parseCDSCoord(str_gbk_loc): + """ + Description: + Function to parse a string from a GenBank feature's coordinates. + ******************************************************************************************************************** + Parameters: + - str_gbk_loc: The string value of the feature coordinate from a GenBank file being parsed by Biopython. + ******************************************************************************************************************** + """ try: start = None end = None @@ -449,6 +599,18 @@ def parseCDSCoord(str_gbk_loc): raise RuntimeError(traceback.format_exc()) def cleanUpSampleName(original_name): + """ + Description: + Function to clean up a string corresponding to some name to make it easier to write files referencing. + ******************************************************************************************************************** + Parameters: + - name: A string corresponding to some name. + ******************************************************************************************************************** + Returns: + - cleaned_up_name: A cleaned up string where troublesome characters for having as part of fielnames are replaced + with less problematic characters. + ******************************************************************************************************************** + """ return original_name.replace('#', '').replace('*', '_').replace(':', '_').replace(';', '_').replace(' ', '_').replace( ':', '_').replace('|', '_').replace('"', '_').replace("'", '_').replace("=", "_").replace('-', '_').replace('(', @@ -456,6 +618,18 @@ def cleanUpSampleName(original_name): ')', '').replace('/', '').replace('\\', '').replace('[', '').replace(']', '').replace(',', '') def parseGECCOGBKForFunction(bgc_gbk, logObject): + """ + Description: + Function to parse GECCO BGC GenBank for product function + ******************************************************************************************************************** + Parameters: + - bgc_gbk: The GECCO BGC GenBank file path. 
+ - logObject: a python logging object + ******************************************************************************************************************** + Returns: + - product: The predicted product/type of BGC by GECCO. + ******************************************************************************************************************** + """ try: rec = SeqIO.read(bgc_gbk, 'genbank') product = 'unknown' @@ -474,6 +648,18 @@ def parseGECCOGBKForFunction(bgc_gbk, logObject): raise RuntimeError() def parseAntiSMASHGBKForFunction(bgc_gbk, logObject, compress_multi=True): + """ + Description: + Function to parse antiSMASH BGC GenBank for product function. + ******************************************************************************************************************** + Parameters: + - bgc_gbk: The antiSMASH BGC GenBank file path. + - logObject: a python logging object. + ******************************************************************************************************************** + Returns: + - product: The predicted product(s)/type(s) of BGC by antiSMASH. + ******************************************************************************************************************** + """ product = 'unknown' try: products = set([]) @@ -501,15 +687,19 @@ def parseAntiSMASHGBKForFunction(bgc_gbk, logObject, compress_multi=True): def parseOrthoFinderMatrix(orthofinder_matrix_file, relevant_gene_lts, all_primary=False): """ + Description: Function to parse and return information from OrthoFinderV2 de novo homolog group identification. - - :param orthofinder_matrix: OrthoFinderV2 matrix Orthogroups.csv file - :param relevant_gene_lts: set of all the relevant gene locus tag identifiers found in BGC Genbanks - - :return gene_to_hg: dictionary mapping gene locus tags to homolog group - :return hg_genes: dictionary with set of gene locus tags for each homolog group - :return hg_median_gene_counts: median copy count for each homolog group - :return hg_multicopy_proportion: proportion of samples with homolog group which have multiple (paralogous) genes in the homolog group. + ******************************************************************************************************************** + Parameters: + - orthofinder_matrix: OrthoFinderV2 matrix Orthogroups.csv file + - relevant_gene_lts: set of all the relevant gene locus tag identifiers found in BGC Genbanks + ******************************************************************************************************************** + Returns: + - gene_to_hg: dictionary mapping gene locus tags to homolog group + - hg_genes: dictionary with set of gene locus tags for each homolog group + - hg_median_gene_counts: median copy count for each homolog group + - hg_multicopy_proportion: proportion of samples with homolog group which have multiple (paralogous) genes in the homolog group. + ******************************************************************************************************************** """ gene_to_hg = {} hg_genes = defaultdict(set) @@ -543,8 +733,15 @@ def parseOrthoFinderMatrix(orthofinder_matrix_file, relevant_gene_lts, all_prima def run_cmd(cmd, logObject, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL): """ - Simple function to run a single command through subprocess with logging. + Description: + Void, simple function to run a single command through subprocess with logging.
+ ******************************************************************************************************************** + Parameters: + - cmd: A list corresponding to a command. + - logObject: A python logging object. + ******************************************************************************************************************** """ + logObject.info('Running the following command: %s' % ' '.join(cmd)) try: subprocess.call(' '.join(cmd), shell=True, stdout=stdout, stderr=stderr, executable='/bin/bash') @@ -556,8 +753,15 @@ def run_cmd(cmd, logObject, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL def multiProcessNoLogWithTimeout(input): """ - Genralizable function to be used with multiprocessing to parallelize list of commands. Inputs should correspond - to space separated command (as list). + Description: + Void, generalizable function to be used with multiprocessing to parallelize list of commands. Inputs should correspond + to space separated command (as list). Similar to the more general function, only this one times out after a while. + ******************************************************************************************************************** + Parameters: + - input: + - input_cmd: Up to last item the input should correspond to the command to run in subprocess. + - TIMEOUT: The last item in the list should correspond to the timeout in seconds. + ******************************************************************************************************************** """ input_cmd = input[:-1] TIMEOUT = input[-1] @@ -571,8 +775,13 @@ def multiProcessNoLogWithTimeout(input): def multiProcessNoLog(input_cmd): """ - Genralizable function to be used with multiprocessing to parallelize list of commands. Inputs should correspond - to space separated command (as list). + Description: + Void, generalizable function to be used with multiprocessing to parallelize list of commands. Inputs should correspond + to space separated command (as list). Same as the generalizable function, only this one doesn't take a log object + as part of the input. + ******************************************************************************************************************** + Parameters: + - input_cmd: A list corresponding to an input command. """ try: subprocess.call(' '.join(input_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, @@ -582,9 +791,16 @@ def multiProcessNoLog(input_cmd): def multiProcess(input): """ - Genralizable function to be used with multiprocessing to parallelize list of commands. Inputs should correspond + Description: + Void, generalizable function to be used with multiprocessing to parallelize list of commands. Inputs should correspond to space separated command (as list), with last item in list corresponding to a logging object handle for logging progress. + ******************************************************************************************************************** + Parameters: + - input: + - input_cmd: Up to last item the input should correspond to the command to run in subprocess. + - logObject: The last item in the list should correspond to a python logging object. + ******************************************************************************************************************** """ input_cmd = input[:-1] logObject = input[-1] @@ -599,6 +815,21 @@ def multiProcess(input): sys.stderr.write(traceback.format_exc()) def addLocusTagsToGBKs(inputs): + """ + Description: + Void function to add locus tags to a sample's genome and BGC GenBank files.
+ ******************************************************************************************************************** + Parameters: + - inputs: + - sample: The sample name. + - locus_tag_prefix: The locus tag prefix to use. + - resdir: The results directory to write the updated GenBank files to. + - genome_gbk: The genome GenBank file to process. + - region_gbks: The BGC region GenBank files to process. + - keep_ids_flag: A flag variable on whether to attempt to keep the original locus tags. + - logObject: a python logging object. + ******************************************************************************************************************** + """ sample = inputs[0] locus_tag_prefix = inputs[1] resdir = inputs[2] @@ -702,6 +933,19 @@ def addLocusTagsToGBKs(inputs): logObject.warning(traceback.format_exc()) def checkCDSHaveLocusTags(inputs): + """ + Description: + Void function to check if all CDS features in a GenBank file have locus_tag qualifiers. + ******************************************************************************************************************** + Parameters: + - inputs: + - sample: The sample name. + - gbk_type: The type of the GenBank file. + - gbk: The input GenBank file. + - outf: A file to write stats on CDS count and whether they all had locus tags. + - logObject: a python logging object. + ******************************************************************************************************************** + """ sample, gbk_type, gbk, outf, logObject = inputs try: cds_count = 0 @@ -727,6 +971,18 @@ def checkCDSHaveLocusTags(inputs): logObject.warning(traceback.format_exc()) def findAntiSMASHBGCInFullGenbank(inputs): + """ + Description: + Void function to parse a BGC GenBank from antiSMASH to determine the location/coordinates of it in the + genome. + ******************************************************************************************************************** + Parameters: + - inputs: + - full_gbk: The full genome GenBank file. + - bgc_gbk: The antiSMASH BGC GenBank file. + - outf: The output file where to write information on BGC location to. + ******************************************************************************************************************** + """ try: full_gbk, bgc_gbk, outf = inputs @@ -761,6 +1017,20 @@ def findAntiSMASHBGCInFullGenbank(inputs): raise RuntimeWarning(traceback.format_exc()) def processAntiSMAHSBGCtoGenomeMappingResults(process_data, logObject): + """ + Definition: + Function to process antiSMASH BGC to genome mapping results and return the info as a list. + ******************************************************************************************************************** + Parameters: + - process_data: A list of lists containing paths to files with BGC to genome mapping information. + - logObject: A python logging object. + ******************************************************************************************************************** + Returns: + - bgc_locations: A list of lists where the first item in a sublist is the bgc identifier/path followed by which + sample it belongs to, the scaffold it is on, the start and end coordinates, the length of the bgc, + and the distance of the BGC start/end to the nearest scaffold edge.
+ ******************************************************************************************************************** + """ try: loc_lists = defaultdict(list) for pd in process_data: @@ -793,6 +1063,17 @@ def processAntiSMAHSBGCtoGenomeMappingResults(process_data, logObject): sys.exit(1) def extractProteinsFromGenBank(inputs): + """ + Definition: + Void function to extract protein sequences from CDS features in a GenBank file and output in FASTA format. + ******************************************************************************************************************** + Parameters: + - inputs: + - input_genbank: The input GenBank file with CDS features and translation + locus_tag qualifiers + - output_proteome: The output FASTA proteome file + - logObject: A python logging object + ******************************************************************************************************************** + """ input_genbank, output_proteome, logObject = inputs try: op_handle = open(output_proteome, 'w') @@ -810,6 +1091,9 @@ def extractProteinsFromGenBank(inputs): raise RuntimeError(traceback.format_exc()) def determineAsofName(asof_index): + """ + Simple function to make integers into strings of equal length by appending 0s as prefixes. + """ asof_index_str = str(asof_index) asof_name = None if len(asof_index_str) == 1: @@ -830,6 +1114,21 @@ def determineAsofName(asof_index): return(asof_name) def runOrthoFinder2Full(prot_directory, orthofinder_outdir, run_msa, logObject, threads=1): + """ + Definition: + Wrapper function to run OrthoFinder2 for bacteria (involves some extra processing of hierarchical orthogroups). + ******************************************************************************************************************** + Parameters: + - prot_directory: The input directory of proteomes in FASTA format to provide to OrthoFinder2 as input. + - orthofinder_outdir: The workspace/output directory for OrthoFinder2 analysis. + - run_msa: Flag on whether to infer hierarchical orthogroups using MSA instead of DendroBlast. + - logObject: A python logging object. + - threads (optional): The number of threads. Default is 1. + ******************************************************************************************************************** + Returns: + - result_file: The resulting orthogroup by sample carriage matrix. + ******************************************************************************************************************** + """ result_file = orthofinder_outdir + 'Final_Orthogroups.tsv' try: orthofinder_cmd = ['orthofinder', '-f', prot_directory, '-t', str(threads)] @@ -984,6 +1283,22 @@ def runOrthoFinder2Full(prot_directory, orthofinder_outdir, run_msa, logObject, def runOrthoFinder2FullFungal(prot_directory, orthofinder_outdir, run_msa, logObject, threads=1): + """ + Definition: + Wrapper function to run OrthoFinder2 for fungi. + ******************************************************************************************************************** + Parameters: + - prot_directory: The input directory of proteomes in FASTA format to provide to OrthoFinder2 as input. + - orthofinder_outdir: The workspace/output directory for OrthoFinder2 analysis. + - run_msa: Flag on whether to infer hierarchical orthogroups using MSA instead of DendroBlast. + - logObject: A python logging object. + - threads (optional): The number of threads. Default is 1.
+ ******************************************************************************************************************** + Returns: + - result_file: The resulting orthogroup by sample carriage matrix. + ******************************************************************************************************************** + """ + result_file = orthofinder_outdir + 'Final_Orthogroups.tsv' try: orthofinder_cmd = ['orthofinder', '-f', prot_directory, '-t', str(threads)] @@ -1060,6 +1375,21 @@ def runOrthoFinder2FullFungal(prot_directory, orthofinder_outdir, run_msa, logOb def runOrthoFinder2(prot_directory, orthofinder_outdir, logObject, threads=1): + """ + Definition: + Wrapper function to run OrthoFinder2 up until the coarse orthogroups are determined (basically the end of + OrthoFinder 1). + ******************************************************************************************************************** + Parameters: + - prot_directory: The input directory of proteomes in FASTA format to provide to OrthoFinder2 as input. + - orthofinder_outdir: The workspace/output directory for OrthoFinder2 analysis. + - logObject: A python logging object. + - threads (optional): The number of threads. Default is 1. + ******************************************************************************************************************** + Returns: + - result_file: The resulting orthogroup by sample carriage matrix. + ******************************************************************************************************************** + """ result_file = orthofinder_outdir + 'Final_Orthogroups.tsv' try: orthofinder_cmd = ['orthofinder', '-f', prot_directory, '-t', str(threads), '-og'] @@ -1090,6 +1420,23 @@ def runOrthoFinder2(prot_directory, orthofinder_outdir, logObject, threads=1): return result_file def runPanaroo(detailed_BGC_listing_file, panaroo_input_dir, results_directory, logObject, threads=1, panaroo_options='--clean-mode moderate --remove-invalid-genes'): + """ + Definition: + Wrapper function to run Panaroo. + ******************************************************************************************************************** + Parameters: + - detailed_BGC_listing_file: A detailed BGC listing file with positional/location information for each BGC. + - panaroo_input_dir: The directory with input GFF files (in Prokka format) to provide to Panaroo as input. + - results_directory: The workspace/output directory for Panaroo analysis. + - logObject: A python logging object. + - threads (optional): . Default is 1. + - panaroo_options (optional): Additional options for running Panaroo. Default is "--clean-mode moderate + --remove-invalid-genes". + ******************************************************************************************************************** + Returns: + - result_file: The resulting orthogroup by sample carriage matrix. 
+ ************************************************************************************************************************
+ """
result_file = results_directory + 'Final_Orthogroups.tsv'
try:
sample_genomes = {}
@@ -1180,55 +1527,12 @@ def runPanaroo(detailed_BGC_listing_file, panaroo_input_dir, results_directory,
sys.stderr.write(msg + '\n')
sys.stderr.write(traceback.format_exc() + '\n')
sys.exit(1)
- return result_file
-def selectFinalResultsAndCleanUp(outdir, fin_outdir, logObject):
-
- """
- TODO: Refactor for lsaBGC-PAN
- """
- try:
- delete_set = set(['BLASTing_of_Ortholog_Groups', 'OrthoFinder2_Results', 'KOfam_Annotations',
- 'Prodigal_Gene_Calling_Additional', 'Predicted_Proteomes_Initial',
- 'Prodigal_Gene_Calling', 'Genomic_Genbanks_Initial'])
- for fd in os.listdir(outdir):
- if os.path.isfile(fd): continue
- subdir = outdir + fd + '/'
- if fd in delete_set:
- os.system('rm -rf %s' % subdir)
- if os.path.isdir(outdir + 'BGCs_Retagged_and_Updated'):
- os.system('rm -rf %s' % outdir + 'BGCs_Retagged')
- if os.path.isdir(outdir + 'lsaBGC_AutoExpansion_Results/'):
- os.system('ln -s %s %s' % (
- outdir + 'lsaBGC_AutoExpansion_Results/Updated_GCF_Listings/', fin_outdir + 'Expanded_GCF_Listings'))
- os.system('ln -s %s %s' % (
- outdir + 'lsaBGC_AutoExpansion_Results/Orthogroups.expanded.tsv', fin_outdir + 'Expanded_Orthogroups.tsv'))
- os.system('ln -s %s %s' % (outdir + 'lsaBGC_AutoExpansion_Results/Sample_Annotation_Files.txt',
- fin_outdir + 'Expanded_Sample_Annotation_Files.txt'))
- if os.path.isfile(outdir + 'Intermediate_Files/GToTree_output.tre'):
- os.system('ln -s %s %s' % (outdir + 'Intermediate_Files/GToTree_output.tre', fin_outdir))
- if os.path.isfile(outdir + 'Intermediate_Files/GToTree_Expected_Similarities.txt'):
- os.system('ln -s %s %s' % (outdir + 'Intermediate_Files/GToTree_Expected_Similarities.txt', fin_outdir))
- if os.path.isfile(outdir + 'Intermediate_Files/Samples_in_GToTree_Tree.txt'):
- os.system('ln -s %s %s' % (outdir + 'Intermediate_Files/Samples_in_GToTree_Tree.txt', fin_outdir))
- else:
- os.system('ln -s %s %s' % (outdir + 'Intermediate_Files/*', fin_outdir))
- if os.path.isdir(outdir + 'BiG_SCAPE_Results_Reformatted/'):
- os.system('ln -s %s %s' % (
- outdir + 'BiG_SCAPE_Results_Reformatted/GCF_Listings/', fin_outdir + 'GCF_Listings'))
- os.system('ln -s %s %s' % (
- outdir + 'BiG_SCAPE_Results_Reformatted/GCF_Details.txt', fin_outdir + 'GCF_Details.txt'))
- elif os.path.isdir(outdir + 'lsaBGC_Cluster_Results/'):
- os.system(
- 'ln -s %s %s' % (outdir + 'lsaBGC_Cluster_Results/GCF_Listings/', fin_outdir + 'GCF_Listings'))
- os.system('ln -s %s %s' % (
- outdir + 'lsaBGC_Cluster_Results/GCF_Details_Expanded_Singletons.txt', fin_outdir + 'GCF_Details.txt'))
- except Exception as e:
- raise RuntimeError(traceback.format_exc())
-
-
def setupReadyDirectory(directories):
+ """
+ Simple function to (re-)create directories provided as a list.
+ """
try:
assert (type(directories) is list)
for d in directories:
@@ -1304,18 +1608,17 @@ def is_genbank(gbk):
return True
except:
return False
-
-def parseVersionFromSetupPy():
- """
- Parses version from setup.py program.
- """
- return(str(version))
-
+
def createLoggerObject(log_file):
"""
+ Description:
Function which creates logger object.
- :param log_file: path to log file.
- :return: logging logger object.
+ ************************************************************************************************************************
+ Parameters:
+ - log_file: The path to the log file.
+ ************************************************************************************************************************
+ Returns:
+ - logObject: The logging logger object.
"""
logger = logging.getLogger('task_logger')
logger.setLevel(logging.DEBUG)
@@ -1330,8 +1633,12 @@ def createLoggerObject(log_file):
def closeLoggerObject(logObject):
"""
- Function which closes/terminates loggerObject.
- :param logObject: logging logger object to close
+ Description:
+ Void function which closes/terminates a logger object.
+ ************************************************************************************************************************
+ Parameters:
+ - logObject: A Python logger object to close.
+ ************************************************************************************************************************
"""
handlers = logObject.handlers[:]
for handler in handlers:
@@ -1367,8 +1674,21 @@ def logParametersToObject(logObject, parameter_names, parameter_values):
pn = parameter_names[i]
logObject.info(pn + ': ' + str(pv))
-
def determineSeqSimProteinAlignment(protein_alignment_file, use_only_core=True):
+ """
+ Definition:
+ Function to determine sequence similarity between pairs of proteins in a protein multiple sequence alignment.
+ Used in popstrat.
+ ************************************************************************************************************************
+ Parameters:
+ - protein_alignment_file: The input protein multiple sequence alignment in FASTA format.
+ - use_only_core (optional): Use only core sites shared by both sequences (neither sequence has an ambiguous allele).
+ Default is True.
+ ************************************************************************************************************************
+ Returns:
+ - pair_seq_matching: A dictionary showing the sequence similarity between proteins in the alignment.
+ ************************************************************************************************************************
+ """
protein_sequences = {}
with open(protein_alignment_file) as ocaf:
for rec in SeqIO.parse(ocaf, 'fasta'):
@@ -1402,6 +1722,12 @@ def determineSeqSimProteinAlignment(protein_alignment_file, use_only_core=True):
return pair_seq_matching
+def parseVersion():
+ """
+ Returns the suite version.
+ """
+ return(str(version))
+
def castToNumeric(x):
"""
Description:
@@ -1466,6 +1792,32 @@ def loadTableInPandaDataFrame(input_file, numeric_columns):
def createFinalSpreadsheets(detailed_BGC_listing_with_Pop_and_GCF_map_file, zol_results_dir, zol_high_qual_flag, mibig_dir, recon_result_file, recon_og_result_file, recon_pop_color_file, sociate_result_file, final_spreadsheet_xlsx, scratch_dir, logObject):
+ """
+ Definition:
+ Void function to create the final consolidated spreadsheet, the main result of lsaBGC-Pan.
+ ************************************************************************************************************************
+ Parameters:
+ - detailed_BGC_listing_with_Pop_and_GCF_map_file: The detailed BGC listing file produced in lsaBGC-Pan with much of
+ the information for sample mapping, GCF mapping, and context of
+ each BGC.
+ - zol_results_dir: The directory with results from zol analyses for all GCFs.
+ - zol_high_qual_flag: A flag for whether zol analyses were run with a more comprehensive/high-quality preset of
+ options.
+ - mibig_dir: The directory with results from lsaBGC-MIBiGMapper for all GCFs.
+ - recon_result_file: The major result file from lsaBGC-Reconcile, a table showing how BGC-associated orthogroups
+ are distributed in different populations and GCF contexts.
+ - recon_og_result_file: The secondary major result file from lsaBGC-Reconcile, which shows a matrix of orthogroup
+ carriage across samples.
+ - recon_pop_color_file: The population coloring determined in lsaBGC-Reconcile, intended for eventually coloring the
+ spreadsheet. Currently not used!
+ - sociate_result_file: The major result file from lsaBGC-Sociate showing the results of finding associated/
+ dissociated orthogroups and GCFs relative to focal GCFs.
+ - final_spreadsheet_xlsx: The path of the final consolidated spreadsheet, created by this function.
+ - scratch_dir: A scratch space to write temporary/intermediate files.
+ - logObject: A Python logging object.
+ ************************************************************************************************************************
+ """
+
try:
# generate Excel spreadsheet
writer = pd.ExcelWriter(final_spreadsheet_xlsx, engine='xlsxwriter')
diff --git a/workflows/lsaBGC-Pan b/workflows/lsaBGC-Pan
index 3ec4d6f..67e06b5 100644
--- a/workflows/lsaBGC-Pan
+++ b/workflows/lsaBGC-Pan
@@ -167,7 +167,7 @@ def create_parser():
return args
def lsaBGC():
- version_string = util.parseVersionFromSetupPy()
+ version_string = util.parseVersion()
if '-v' in sys.argv or '--version' in sys.argv:
logo = """
__ ___ _____ _____