update to v1.1.2

Kalan-Lab · Aug 28, 2024 · da147a1 · da147a1
1 parent 7686ee7
commit da147a1
Show file tree

Hide file tree

Showing 12 changed files with 563 additions and 228 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -9,5 +9,5 @@ authors:
   orcid: "https://orcid.org/0000-0003-4980-5128"
 title: "lsaBGC-Pan"
 version: 1.1.2
-date-released: 2024-08-21
+date-released: 2024-08-28
 url: "https://github.com/Kalan-Lab/lsaBGC-Pan"
diff --git a/bin/lsaBGC-Cluster b/bin/lsaBGC-Cluster
@@ -114,7 +114,7 @@ def lsaBGC_Cluster():
 	# create logging object
 	log_file = outdir + 'Progress.log'
 	logObject = util.createLoggerObject(log_file)
-	version_string = util.parseVersionFromSetupPy()
+	version_string = util.parseVersion()
 	logObject.info('Running lsaBGC version %s' % version_string)
 
 	# Step 0: Log input arguments and update reference and query FASTA files.

diff --git a/bin/lsaBGC-ComprehenSeeIve b/bin/lsaBGC-ComprehenSeeIve
@@ -118,7 +118,7 @@ def lsaBGC_ComprehenSeeIve():
     # create logging object
     log_file = outdir + 'Progress.log'
     logObject = util.createLoggerObject(log_file)
-    version_string = util.parseVersionFromSetupPy()
+    version_string = util.parseVersion()
     logObject.info('Running lsaBGC version %s' % version_string)
 
     # Log input arguments and update reference and query FASTA files.

diff --git a/bin/lsaBGC-MIBiGMapper b/bin/lsaBGC-MIBiGMapper
@@ -125,7 +125,7 @@ def mapToMIBiG():
 	# create logging object
 	log_file = outdir + 'Progress.log'
 	logObject = util.createLoggerObject(log_file)
-	version_string = util.parseVersionFromSetupPy()
+	version_string = util.parseVersion()
 	logObject.info('Running lsaBGC version %s' % version_string)
 
 

diff --git a/bin/lsaBGC-Reconcile b/bin/lsaBGC-Reconcile
@@ -125,7 +125,7 @@ def lsaBGC_Reconcile():
     # create logging object
     log_file = outdir + 'Progress.log'
     logObject = util.createLoggerObject(log_file)
-    version_string = util.parseVersionFromSetupPy()
+    version_string = util.parseVersion()
     logObject.info('Running lsaBGC version %s' % version_string)
 
     # Log input arguments and update reference and query FASTA files.

diff --git a/bin/lsaBGC-See b/bin/lsaBGC-See
@@ -111,7 +111,7 @@ def lsaBGC_See():
     # create logging object
     log_file = outdir + 'Progress.log'
     logObject = util.createLoggerObject(log_file)
-    version_string = util.parseVersionFromSetupPy()
+    version_string = util.parseVersion()
     logObject.info('Running lsaBGC version %s' % version_string)
 
     # Log input arguments and update reference and query FASTA files.

diff --git a/bin/lsaBGC-Sociate b/bin/lsaBGC-Sociate
@@ -129,7 +129,7 @@ def lsaBGC_Sociate():
     # create logging object
     log_file = outdir + 'Progress.log'
     logObject = util.createLoggerObject(log_file)
-    version_string = util.parseVersionFromSetupPy()
+    version_string = util.parseVersion()
     logObject.info('Running lsaBGC version %s' % version_string)
 
     # Log input arguments and update reference and query FASTA files.

diff --git a/src/lsaBGC/classes/BGC.py b/src/lsaBGC/classes/BGC.py
@@ -163,7 +163,7 @@ def parseGECCO(self, comprehensive_parsing=True, flank_size=2000):
 		self.cluster_information = bgc_info
 
 	def parseAntiSMASH(self, comprehensive_parsing=True, flank_size=2000):
-		""" Functoin to parse BGC Genbank produced by antiSMASH """
+		""" Function to parse BGC Genbank produced by antiSMASH """
 		bgc_info = []
 		domains = []
 		core_positions = set([])
@@ -325,69 +325,4 @@ def parseGenbanks(self, comprehensive_parsing=True, flank_size=2000):
 		elif self.prediction_method.upper() == 'GECCO':
 			self.parseGECCO(comprehensive_parsing=comprehensive_parsing, flank_size=flank_size)
 		else:
-			raise RuntimeError("Unable to parse file because BGC prediction method is not an accepted option!")
-
-	def refineGenbank(self, refined_genbank_file, first_bg, second_bg):
-		"""
-		Function to prune and update coordinates of BGC Genbank - main functoin of lsaBGC-Refiner
-		"""
-		try:
-			rgf_handle = open(refined_genbank_file, 'w')
-			start_coord = min([self.gene_information[first_bg]['start'], self.gene_information[first_bg]['end'], self.gene_information[second_bg]['start'], self.gene_information[second_bg]['end']])
-			end_coord = max([self.gene_information[first_bg]['start'], self.gene_information[first_bg]['end'], self.gene_information[second_bg]['start'], self.gene_information[second_bg]['end']])
-			pruned_coords = set(range(start_coord, end_coord+1))
-			with open(self.bgc_genbank) as ogbk:
-				recs = [x for x in SeqIO.parse(ogbk, 'genbank')]
-				try:
-					assert(len(recs) == 1)
-				except Exception as e:
-					raise RuntimeError(traceback.format_exc())
-				for rec in recs:
-					original_seq = str(rec.seq)
-					filtered_seq = ""
-					if end_coord == len(original_seq):
-						filtered_seq = original_seq[start_coord-1:]
-					else:
-						filtered_seq = original_seq[start_coord-1:end_coord]
-
-					new_seq_object = Seq(filtered_seq)
-
-					updated_rec = copy.deepcopy(rec)
-					updated_rec.seq = new_seq_object
-
-					updated_features = []
-					for feature in rec.features:
-						all_coords, start, end, direction, is_multi_part = util.parseCDSCoord(str(feature.location))
-
-						feature_coords = set(range(start, end+1))
-						if len(feature_coords.intersection(pruned_coords)) > 0:
-							fls = []
-							for sc, ec, dc in all_coords:
-								updated_start = sc - start_coord + 1
-								updated_end = ec - start_coord + 1
-								if ec > end_coord:
-									if feature.type == 'CDS':
-										continue
-									else:
-										updated_end = end_coord - start_coord + 1  # ; flag1 = True
-								if sc < start_coord:
-									if feature.type == 'CDS':
-										continue
-									else:
-										updated_start = 1  # ; flag2 = True
-
-								strand = 1
-								if dc == '-':
-									strand = -1
-								fls.append(FeatureLocation(updated_start - 1, updated_end, strand=strand))
-							if len(fls) > 0:
-								updated_location = fls[0]
-								if len(fls) > 1:
-									updated_location = sum(fls)
-								feature.location = updated_location
-								updated_features.append(feature)
-					updated_rec.features = updated_features
-					SeqIO.write(updated_rec, rgf_handle, 'genbank')
-			rgf_handle.close()
-		except Exception as e:
-			raise RuntimeError(traceback.format_exc())
+			raise RuntimeError("Unable to parse file because BGC prediction method is not an accepted option!")
diff --git a/src/lsaBGC/classes/GCF.py b/src/lsaBGC/classes/GCF.py
@@ -50,6 +50,11 @@ def __init__(self, bgc_genbanks_listing, gcf_id='GCF_X', logObject=None, lineage
 		self.avoid_samples = set([])
 
 	def identifyKeyHomologGroups(self, all_primary=False):
+		"""
+		Function that is not used currently in lsaBGC-Pan - in the original lsaBGC - this function 
+		computed which orthogroups were core to a BGC, which were single-copy core and which were 
+		commonly found in BGC protocore regions. This is a little more tricky to integrate with zol.
+		"""
 		try:
 			initial_samples_with_at_least_one_gcf_hg = set([])
 			for hg in self.hg_genes:
@@ -96,7 +101,7 @@ def identifyKeyHomologGroups(self, all_primary=False):
 
 	def aggregateProteins(self, gcf_prots_file, draft_mode=False):
 		"""
-		function to aggregate protein sequences from BGC genbanks and output to a file in FASTA format.
+		Function to aggregate protein sequences from BGC genbanks and output to a file in FASTA format.
 		"""
 		try:
 			gpf_handle = open(gcf_prots_file, 'w')
@@ -118,11 +123,14 @@ def aggregateProteins(self, gcf_prots_file, draft_mode=False):
 
 	def modifyPhylogenyForSamplesWithMultipleBGCs(self, input_phylogeny, result_phylogeny, prune_set=None):
 		"""
+		Definition:
 		Function which takes in an input phylogeny and produces a replicate resulting phylogeny with samples/leafs which
 		have multiple BGC instances for a GCF expanded.
-
-		:param input_phylogeny: input newick phylogeny file
-		:result result_phylogeny: resulting newick phylogeny file
+		********************************************************************************************************************
+		Parameters:
+		- input_phylogeny: The input newick phylogeny file.
+		- result_phylogeny: The resulting newick phylogeny file to create.
+		********************************************************************************************************************
 		"""
 		try:
 			number_of_added_leaves = 0
@@ -155,11 +163,13 @@ def modifyPhylogenyForSamplesWithMultipleBGCs(self, input_phylogeny, result_phyl
 
 	def assignColorsToHGs(self, gene_to_hg, bgc_genes, outdir):
 		"""
+		Description:
 		Simple function to associate each homolog group with a color for consistent coloring.
-
-		:param gene_to_hg: gene to HG relationship.
-		:param bgc_genes:  set of genes per HG.
-		:return: dictionary mapping each HG to a hex color value.
+		********************************************************************************************************************
+		Parameters:
+		- gene_to_hg: gene to HG relationship.
+		- bgc_genes:  set of genes per HG.
+		********************************************************************************************************************
 		"""
 
 		hg_bgc_counts = defaultdict(int)
@@ -190,10 +200,14 @@ def assignColorsToHGs(self, gene_to_hg, bgc_genes, outdir):
 
 	def createItolBGCSeeTrack(self, result_track_file):
 		"""
+		Description:
 		Function to create a track file for visualizing BGC gene architecture across a phylogeny in the interactive tree
 		of life (iTol)
-
-		:param result_track_file: The path to the resulting iTol track file for BGC gene visualization.
+		********************************************************************************************************************
+		Parameters:
+		- self: GCF object
+		- result_track_file: The path to the resulting iTol track file for BGC gene visualization.
+		********************************************************************************************************************
 		"""
 		try:
 			track_handle = open(result_track_file, 'w')
@@ -286,14 +300,18 @@ def createItolBGCSeeTrack(self, result_track_file):
 	def visualizeComprehenSeeIveGCFViaR(self, orthofinder_matrix_file, phylogeny_newick_file, heatmap_track_file,
 										detection_track_file, result_pdf_file, plot_rscript):
 		"""
+		Definition:
 		Function to create tracks for visualization of gene architecture of BGCs belonging to GCF and run Rscript bgSee.R
 		to produce automatic PDFs of plots. In addition, bgSee.R also produces a heatmap to more easily identify homolog
 		groups which are conserved across isolates found to feature GCF.
-
-		:param gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!)
-		:param heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!)
-		:param phylogeny_file: Phylogeny to use for visualization.
-		:param result_pdf_file: Path to PDF file where plots from bgSee.R will be written to.
+		********************************************************************************************************************
+		Parameters:
+		- self: GCF object
+		- gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!)
+		- heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!)
+		- phylogeny_file: Phylogeny to use for visualization.
+		- result_pdf_file: Path to PDF file where plots from bgSee.R will be written to.
+		********************************************************************************************************************
 		"""
 		try:
 			if os.path.isfile(heatmap_track_file) or os.path.isfile(detection_track_file):
@@ -422,14 +440,20 @@ def visualizeComprehenSeeIveGCFViaR(self, orthofinder_matrix_file, phylogeny_new
 
 	def visualizeGCFViaR(self, gggenes_track_file, heatmap_track_file, phylogeny_file, result_pdf_file, plot_rscript):
 		"""
+		Definition:
 		Function to create tracks for visualization of gene architecture of BGCs belonging to GCF and run Rscript bgSee.R
 		to produce automatic PDFs of plots. In addition, bgSee.R also produces a heatmap to more easily identify homolog
 		groups which are conserved across isolates found to feature GCF.
-
-		:param gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!)
-		:param heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!)
-		:param phylogeny_file: Phylogeny to use for visualization.
-		:param result_pdf_file: Path to PDF file where plots from bgSee.R will be written to.
+		********************************************************************************************************************
+		Parameters:
+		- self: GCF object.
+		- gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it 
+		                      doesn't exist!)
+		- heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it 
+		                      doesn't exist!)-
+		- phylogeny_file: Phylogeny to use for visualization.
+		- result_pdf_file: Path to PDF file where plots from bgSee.R will be written to.
+		********************************************************************************************************************
 		"""
 		try:
 			if os.path.isfile(gggenes_track_file) or os.path.isfile(heatmap_track_file):
@@ -599,18 +623,22 @@ def visualizeGCFViaR(self, gggenes_track_file, heatmap_track_file, phylogeny_fil
 
 	def constructCodonAlignments(self, outdir, threads=1, only_scc=False, list_alignments=False, filter_outliers=False, use_ms5=True):
 		"""
+		Definition:
 		Function to automate construction of codon alignments. This function first extracts protein and nucleotide sequnces
 		from BGC Genbanks, then creates protein alignments for each homolog group using MAFFT, and finally converts those
 		into codon alignments using PAL2NAL.
-
-		:param outdir: Path to output/workspace directory. Intermediate files (like extracted nucleotide and protein
-						 sequences, protein and codon alignments, will be writen to respective subdirectories underneath this
-						 one).
-		:param threads: Number of threads/threads to use when fake-parallelizing jobs using multiprocessing.
-		:param only_scc: Whether to construct codon alignments only for homolog groups which are found to be core and in
-						 single copy for samples with the GCF. Note, if working with draft genomes and the BGC is fragmented
-						 this should be able to still identify SCC homolog groups across the BGC instances belonging to the
-						 GCF.
+		********************************************************************************************************************
+		Parameters:
+		- self: GCF object.
+		- outdir: Path to output/workspace directory. Intermediate files (like extracted nucleotide and protein
+				  sequences, protein and codon alignments, will be writen to respective subdirectories underneath this
+				  one).
+		- threads: Number of threads/threads to use when fake-parallelizing jobs using multiprocessing.
+		- only_scc: Whether to construct codon alignments only for homolog groups which are found to be core and in
+					single copy for samples with the GCF. Note, if working with draft genomes and the BGC is fragmented
+					this should be able to still identify SCC homolog groups across the BGC instances belonging to the
+					GCF.
+		********************************************************************************************************************
 		"""
 
 		nucl_seq_dir = os.path.abspath(outdir) + '/Nucleotide_Sequences/'
@@ -695,11 +723,15 @@ def constructCodonAlignments(self, outdir, threads=1, only_scc=False, list_align
 
 	def constructGCFPhylogeny(self, output_alignment, output_phylogeny, only_scc=False, ambiguious_position_cutoff=0.000001):
 		"""
+		Definition:
 		Function to create phylogeny based on codon alignments of SCC homolog groups for GCF.
-
-		:param output_alignment: Path to output file for concatenated SCC homolog group alignment.
-		:param output_phylogeny: Path to output file for approximate maximum-likelihood phylogeny produced by FastTree2 from
-									 concatenated SCC homolog group alignment.
+		********************************************************************************************************************
+		Parameters:
+		- self: GCF object.
+		- output_alignment: Path to output file for concatenated SCC homolog group alignment.
+		- output_phylogeny: Path to output file for approximate maximum-likelihood phylogeny produced by FastTree2 from
+							concatenated SCC homolog group alignment.
+		********************************************************************************************************************
 		"""
 		try:
 			if only_scc:
@@ -796,9 +828,13 @@ def constructGCFPhylogeny(self, output_alignment, output_phylogeny, only_scc=Fal
 
 def create_codon_msas(inputs):
 	"""
+	Definition:
 	Helper function which is to be called from the constructCodonAlignments() function to parallelize construction
 	of codon alignments for each homolog group of interest in the GCF.
-	:param inputs: list of inputs passed in by GCF.constructCodonAlignments().
+	********************************************************************************************************************
+	Parameters:
+	- inputs: list of inputs passed in by GCF.constructCodonAlignments().
+	********************************************************************************************************************
 	"""
 	hg, gene_sequences, nucl_seq_dir, prot_seq_dir, prot_alg_dir, codo_alg_dir, threads, use_ms5, logObject = inputs