Skip to content

Commit

Permalink
update to v1.1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
raufs committed Aug 28, 2024
1 parent 7686ee7 commit da147a1
Show file tree
Hide file tree
Showing 12 changed files with 563 additions and 228 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ authors:
orcid: "https://orcid.org/0000-0003-4980-5128"
title: "lsaBGC-Pan"
version: 1.1.2
date-released: 2024-08-21
date-released: 2024-08-28
url: "https://github.com/Kalan-Lab/lsaBGC-Pan"
2 changes: 1 addition & 1 deletion bin/lsaBGC-Cluster
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def lsaBGC_Cluster():
# create logging object
log_file = outdir + 'Progress.log'
logObject = util.createLoggerObject(log_file)
version_string = util.parseVersionFromSetupPy()
version_string = util.parseVersion()
logObject.info('Running lsaBGC version %s' % version_string)

# Step 0: Log input arguments and update reference and query FASTA files.
Expand Down
2 changes: 1 addition & 1 deletion bin/lsaBGC-ComprehenSeeIve
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def lsaBGC_ComprehenSeeIve():
# create logging object
log_file = outdir + 'Progress.log'
logObject = util.createLoggerObject(log_file)
version_string = util.parseVersionFromSetupPy()
version_string = util.parseVersion()
logObject.info('Running lsaBGC version %s' % version_string)

# Log input arguments and update reference and query FASTA files.
Expand Down
2 changes: 1 addition & 1 deletion bin/lsaBGC-MIBiGMapper
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def mapToMIBiG():
# create logging object
log_file = outdir + 'Progress.log'
logObject = util.createLoggerObject(log_file)
version_string = util.parseVersionFromSetupPy()
version_string = util.parseVersion()
logObject.info('Running lsaBGC version %s' % version_string)


Expand Down
2 changes: 1 addition & 1 deletion bin/lsaBGC-Reconcile
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def lsaBGC_Reconcile():
# create logging object
log_file = outdir + 'Progress.log'
logObject = util.createLoggerObject(log_file)
version_string = util.parseVersionFromSetupPy()
version_string = util.parseVersion()
logObject.info('Running lsaBGC version %s' % version_string)

# Log input arguments and update reference and query FASTA files.
Expand Down
2 changes: 1 addition & 1 deletion bin/lsaBGC-See
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def lsaBGC_See():
# create logging object
log_file = outdir + 'Progress.log'
logObject = util.createLoggerObject(log_file)
version_string = util.parseVersionFromSetupPy()
version_string = util.parseVersion()
logObject.info('Running lsaBGC version %s' % version_string)

# Log input arguments and update reference and query FASTA files.
Expand Down
2 changes: 1 addition & 1 deletion bin/lsaBGC-Sociate
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def lsaBGC_Sociate():
# create logging object
log_file = outdir + 'Progress.log'
logObject = util.createLoggerObject(log_file)
version_string = util.parseVersionFromSetupPy()
version_string = util.parseVersion()
logObject.info('Running lsaBGC version %s' % version_string)

# Log input arguments and update reference and query FASTA files.
Expand Down
69 changes: 2 additions & 67 deletions src/lsaBGC/classes/BGC.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def parseGECCO(self, comprehensive_parsing=True, flank_size=2000):
self.cluster_information = bgc_info

def parseAntiSMASH(self, comprehensive_parsing=True, flank_size=2000):
""" Functoin to parse BGC Genbank produced by antiSMASH """
""" Function to parse BGC Genbank produced by antiSMASH """
bgc_info = []
domains = []
core_positions = set([])
Expand Down Expand Up @@ -325,69 +325,4 @@ def parseGenbanks(self, comprehensive_parsing=True, flank_size=2000):
elif self.prediction_method.upper() == 'GECCO':
self.parseGECCO(comprehensive_parsing=comprehensive_parsing, flank_size=flank_size)
else:
raise RuntimeError("Unable to parse file because BGC prediction method is not an accepted option!")

def refineGenbank(self, refined_genbank_file, first_bg, second_bg):
"""
Function to prune and update coordinates of BGC Genbank - main functoin of lsaBGC-Refiner
"""
try:
rgf_handle = open(refined_genbank_file, 'w')
start_coord = min([self.gene_information[first_bg]['start'], self.gene_information[first_bg]['end'], self.gene_information[second_bg]['start'], self.gene_information[second_bg]['end']])
end_coord = max([self.gene_information[first_bg]['start'], self.gene_information[first_bg]['end'], self.gene_information[second_bg]['start'], self.gene_information[second_bg]['end']])
pruned_coords = set(range(start_coord, end_coord+1))
with open(self.bgc_genbank) as ogbk:
recs = [x for x in SeqIO.parse(ogbk, 'genbank')]
try:
assert(len(recs) == 1)
except Exception as e:
raise RuntimeError(traceback.format_exc())
for rec in recs:
original_seq = str(rec.seq)
filtered_seq = ""
if end_coord == len(original_seq):
filtered_seq = original_seq[start_coord-1:]
else:
filtered_seq = original_seq[start_coord-1:end_coord]

new_seq_object = Seq(filtered_seq)

updated_rec = copy.deepcopy(rec)
updated_rec.seq = new_seq_object

updated_features = []
for feature in rec.features:
all_coords, start, end, direction, is_multi_part = util.parseCDSCoord(str(feature.location))

feature_coords = set(range(start, end+1))
if len(feature_coords.intersection(pruned_coords)) > 0:
fls = []
for sc, ec, dc in all_coords:
updated_start = sc - start_coord + 1
updated_end = ec - start_coord + 1
if ec > end_coord:
if feature.type == 'CDS':
continue
else:
updated_end = end_coord - start_coord + 1 # ; flag1 = True
if sc < start_coord:
if feature.type == 'CDS':
continue
else:
updated_start = 1 # ; flag2 = True

strand = 1
if dc == '-':
strand = -1
fls.append(FeatureLocation(updated_start - 1, updated_end, strand=strand))
if len(fls) > 0:
updated_location = fls[0]
if len(fls) > 1:
updated_location = sum(fls)
feature.location = updated_location
updated_features.append(feature)
updated_rec.features = updated_features
SeqIO.write(updated_rec, rgf_handle, 'genbank')
rgf_handle.close()
except Exception as e:
raise RuntimeError(traceback.format_exc())
raise RuntimeError("Unable to parse file because BGC prediction method is not an accepted option!")
104 changes: 70 additions & 34 deletions src/lsaBGC/classes/GCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ def __init__(self, bgc_genbanks_listing, gcf_id='GCF_X', logObject=None, lineage
self.avoid_samples = set([])

def identifyKeyHomologGroups(self, all_primary=False):
"""
Function that is not used currently in lsaBGC-Pan - in the original lsaBGC - this function
computed which orthogroups were core to a BGC, which were single-copy core and which were
commonly found in BGC protocore regions. This is a little more tricky to integrate with zol.
"""
try:
initial_samples_with_at_least_one_gcf_hg = set([])
for hg in self.hg_genes:
Expand Down Expand Up @@ -96,7 +101,7 @@ def identifyKeyHomologGroups(self, all_primary=False):

def aggregateProteins(self, gcf_prots_file, draft_mode=False):
"""
function to aggregate protein sequences from BGC genbanks and output to a file in FASTA format.
Function to aggregate protein sequences from BGC genbanks and output to a file in FASTA format.
"""
try:
gpf_handle = open(gcf_prots_file, 'w')
Expand All @@ -118,11 +123,14 @@ def aggregateProteins(self, gcf_prots_file, draft_mode=False):

def modifyPhylogenyForSamplesWithMultipleBGCs(self, input_phylogeny, result_phylogeny, prune_set=None):
"""
Definition:
Function which takes in an input phylogeny and produces a replicate resulting phylogeny with samples/leafs which
have multiple BGC instances for a GCF expanded.
:param input_phylogeny: input newick phylogeny file
:result result_phylogeny: resulting newick phylogeny file
********************************************************************************************************************
Parameters:
- input_phylogeny: The input newick phylogeny file.
- result_phylogeny: The resulting newick phylogeny file to create.
********************************************************************************************************************
"""
try:
number_of_added_leaves = 0
Expand Down Expand Up @@ -155,11 +163,13 @@ def modifyPhylogenyForSamplesWithMultipleBGCs(self, input_phylogeny, result_phyl

def assignColorsToHGs(self, gene_to_hg, bgc_genes, outdir):
"""
Description:
Simple function to associate each homolog group with a color for consistent coloring.
:param gene_to_hg: gene to HG relationship.
:param bgc_genes: set of genes per HG.
:return: dictionary mapping each HG to a hex color value.
********************************************************************************************************************
Parameters:
- gene_to_hg: gene to HG relationship.
- bgc_genes: set of genes per HG.
********************************************************************************************************************
"""

hg_bgc_counts = defaultdict(int)
Expand Down Expand Up @@ -190,10 +200,14 @@ def assignColorsToHGs(self, gene_to_hg, bgc_genes, outdir):

def createItolBGCSeeTrack(self, result_track_file):
"""
Description:
Function to create a track file for visualizing BGC gene architecture across a phylogeny in the interactive tree
of life (iTol)
:param result_track_file: The path to the resulting iTol track file for BGC gene visualization.
********************************************************************************************************************
Parameters:
- self: GCF object
- result_track_file: The path to the resulting iTol track file for BGC gene visualization.
********************************************************************************************************************
"""
try:
track_handle = open(result_track_file, 'w')
Expand Down Expand Up @@ -286,14 +300,18 @@ def createItolBGCSeeTrack(self, result_track_file):
def visualizeComprehenSeeIveGCFViaR(self, orthofinder_matrix_file, phylogeny_newick_file, heatmap_track_file,
detection_track_file, result_pdf_file, plot_rscript):
"""
Definition:
Function to create tracks for visualization of gene architecture of BGCs belonging to GCF and run Rscript bgSee.R
to produce automatic PDFs of plots. In addition, bgSee.R also produces a heatmap to more easily identify homolog
groups which are conserved across isolates found to feature GCF.
:param gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!)
:param heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!)
:param phylogeny_file: Phylogeny to use for visualization.
:param result_pdf_file: Path to PDF file where plots from bgSee.R will be written to.
********************************************************************************************************************
Parameters:
- self: GCF object
- gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!)
- heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!)
- phylogeny_file: Phylogeny to use for visualization.
- result_pdf_file: Path to PDF file where plots from bgSee.R will be written to.
********************************************************************************************************************
"""
try:
if os.path.isfile(heatmap_track_file) or os.path.isfile(detection_track_file):
Expand Down Expand Up @@ -422,14 +440,20 @@ def visualizeComprehenSeeIveGCFViaR(self, orthofinder_matrix_file, phylogeny_new

def visualizeGCFViaR(self, gggenes_track_file, heatmap_track_file, phylogeny_file, result_pdf_file, plot_rscript):
"""
Definition:
Function to create tracks for visualization of gene architecture of BGCs belonging to GCF and run Rscript bgSee.R
to produce automatic PDFs of plots. In addition, bgSee.R also produces a heatmap to more easily identify homolog
groups which are conserved across isolates found to feature GCF.
:param gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it doesn't exist!)
:param heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it doesn't exist!)
:param phylogeny_file: Phylogeny to use for visualization.
:param result_pdf_file: Path to PDF file where plots from bgSee.R will be written to.
********************************************************************************************************************
Parameters:
- self: GCF object.
- gggenes_track_file: Path to file with gggenes track information (will be created/written to by function, if it
doesn't exist!)
- heatmap_track_file: Path to file for heatmap visual component (will be created/written to by function, if it
doesn't exist!)-
- phylogeny_file: Phylogeny to use for visualization.
- result_pdf_file: Path to PDF file where plots from bgSee.R will be written to.
********************************************************************************************************************
"""
try:
if os.path.isfile(gggenes_track_file) or os.path.isfile(heatmap_track_file):
Expand Down Expand Up @@ -599,18 +623,22 @@ def visualizeGCFViaR(self, gggenes_track_file, heatmap_track_file, phylogeny_fil

def constructCodonAlignments(self, outdir, threads=1, only_scc=False, list_alignments=False, filter_outliers=False, use_ms5=True):
"""
Definition:
Function to automate construction of codon alignments. This function first extracts protein and nucleotide sequnces
from BGC Genbanks, then creates protein alignments for each homolog group using MAFFT, and finally converts those
into codon alignments using PAL2NAL.
:param outdir: Path to output/workspace directory. Intermediate files (like extracted nucleotide and protein
sequences, protein and codon alignments, will be writen to respective subdirectories underneath this
one).
:param threads: Number of threads/threads to use when fake-parallelizing jobs using multiprocessing.
:param only_scc: Whether to construct codon alignments only for homolog groups which are found to be core and in
single copy for samples with the GCF. Note, if working with draft genomes and the BGC is fragmented
this should be able to still identify SCC homolog groups across the BGC instances belonging to the
GCF.
********************************************************************************************************************
Parameters:
- self: GCF object.
- outdir: Path to output/workspace directory. Intermediate files (like extracted nucleotide and protein
sequences, protein and codon alignments, will be writen to respective subdirectories underneath this
one).
- threads: Number of threads/threads to use when fake-parallelizing jobs using multiprocessing.
- only_scc: Whether to construct codon alignments only for homolog groups which are found to be core and in
single copy for samples with the GCF. Note, if working with draft genomes and the BGC is fragmented
this should be able to still identify SCC homolog groups across the BGC instances belonging to the
GCF.
********************************************************************************************************************
"""

nucl_seq_dir = os.path.abspath(outdir) + '/Nucleotide_Sequences/'
Expand Down Expand Up @@ -695,11 +723,15 @@ def constructCodonAlignments(self, outdir, threads=1, only_scc=False, list_align

def constructGCFPhylogeny(self, output_alignment, output_phylogeny, only_scc=False, ambiguious_position_cutoff=0.000001):
"""
Definition:
Function to create phylogeny based on codon alignments of SCC homolog groups for GCF.
:param output_alignment: Path to output file for concatenated SCC homolog group alignment.
:param output_phylogeny: Path to output file for approximate maximum-likelihood phylogeny produced by FastTree2 from
concatenated SCC homolog group alignment.
********************************************************************************************************************
Parameters:
- self: GCF object.
- output_alignment: Path to output file for concatenated SCC homolog group alignment.
- output_phylogeny: Path to output file for approximate maximum-likelihood phylogeny produced by FastTree2 from
concatenated SCC homolog group alignment.
********************************************************************************************************************
"""
try:
if only_scc:
Expand Down Expand Up @@ -796,9 +828,13 @@ def constructGCFPhylogeny(self, output_alignment, output_phylogeny, only_scc=Fal

def create_codon_msas(inputs):
"""
Definition:
Helper function which is to be called from the constructCodonAlignments() function to parallelize construction
of codon alignments for each homolog group of interest in the GCF.
:param inputs: list of inputs passed in by GCF.constructCodonAlignments().
********************************************************************************************************************
Parameters:
- inputs: list of inputs passed in by GCF.constructCodonAlignments().
********************************************************************************************************************
"""
hg, gene_sequences, nucl_seq_dir, prot_seq_dir, prot_alg_dir, codo_alg_dir, threads, use_ms5, logObject = inputs

Expand Down
Loading

0 comments on commit da147a1

Please sign in to comment.