
Commit

Updated Changelog with v3.3.3 info. Limiting number of concurrent requests to UniProt's SPARQL endpoint to a maximum of 4.
rfm-targa committed Jan 25, 2024
1 parent aaab734 commit e6b68a2
Showing 4 changed files with 55 additions and 39 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,19 @@
# Changelog

## 3.3.3 - 2024--

- Fixed a warning related to the BLASTp `--seqidlist` parameter. For BLAST>=2.9, the TXT file with the sequence IDs is converted to binary format with `blastdb_aliastool` (see the first sketch after this list).

- The `Bio.Application` modules are deprecated and may be removed from future Biopython versions. Modified the function that calls MAFFT so that it uses the subprocess module instead of `Bio.Align.Applications.MafftCommandline` (see the second sketch after this list). Changed the Biopython version requirement to >=1.79.

- The `UniprotFinder` module now exits cleanly if the output directory already exists.

- Fixed the in-frame stop codon count values displayed in the reports created by the SchemaEvaluator module.

- Added a `pyproject.toml` configuration file and simplified the instructions in `setup.py`. The use of `setup.py` as a command-line tool is deprecated, and the `pyproject.toml` configuration file allows packages to be installed and built through the recommended method.

- Updated the Dockerfile to install chewBBACA with `python3 -m pip install .` instead of the deprecated `python setup.py install` command.
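
The seqidlist conversion mentioned in the first entry can be illustrated with a short sketch. This is not chewBBACA's actual implementation; the file names are hypothetical and the `blastdb_aliastool` options are the ones documented for recent BLAST+ releases, so treat them as assumptions.

```python
# Minimal sketch (not chewBBACA's actual code) of converting a plain-text
# list of sequence IDs into the binary format expected by BLAST>=2.9.
# File names are hypothetical.
import subprocess


def convert_seqidlist(txt_path, binary_path):
    """Convert a TXT seqidlist to binary format with blastdb_aliastool."""
    cmd = ['blastdb_aliastool',
           '-seqid_file_in', txt_path,
           '-seqid_file_out', binary_path]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError('blastdb_aliastool failed: {0}'.format(result.stderr))
    return binary_path

# Example: convert_seqidlist('seqids.txt', 'seqids.bsl')
```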

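Likewise, the move away from `Bio.Align.Applications.MafftCommandline` can be sketched with a plain subprocess call. This is only an illustration, assuming `mafft` is on the PATH and that the alignment is read from stdout; the options and file names are hypothetical, not necessarily the ones chewBBACA uses.

```python
# Minimal sketch of calling MAFFT through subprocess instead of the
# deprecated Bio.Application wrappers. MAFFT writes the alignment to
# stdout, so the output is captured and written to a file.
import subprocess


def run_mafft(input_fasta, output_fasta):
    """Align the sequences in a FASTA file by calling MAFFT directly."""
    cmd = ['mafft', '--retree', '1', '--maxiterate', '0', input_fasta]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError('MAFFT failed: {0}'.format(result.stderr))
    with open(output_fasta, 'w') as outfile:
        outfile.write(result.stdout)
    return output_fasta

# Example: run_mafft('locus1.fasta', 'locus1_aligned.fasta')
```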
## 3.3.2 - 2024-01-16

- Changed FASTA file validation to reduce memory usage.
58 changes: 26 additions & 32 deletions CHEWBBACA/CHEWBBACA_NS/upload_schema.py
@@ -936,13 +936,13 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
else:
token = ''

# verify user
# Verify user
print('-- User Permissions --')
# GET request headers
headers_get = ct.HEADERS_GET_JSON
headers_get['Authorization'] = token

# determine current user ID and Role
# Determine current user ID and Role
if 'tutorial' not in nomenclature_server:
user_id, user_role, user_auth = cr.user_info(nomenclature_server, headers_get)
else:
@@ -952,11 +952,9 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
print('User role: {0}'.format(user_role))
print('Authorized: {0}\n'.format(user_auth))

# only Admin or Contributor type users can upload schemas
# Only Admin or Contributor type users can upload schemas
if not user_auth:
sys.exit('Current user has no Administrator '
'or Contributor permissions.\n'
'Not allowed to upload schemas.')
sys.exit(ct.LOADSCHEMA_NO_PERMISSIONS)

# POST requests headers
headers_post = ct.HEADERS_POST_JSON
@@ -973,17 +971,15 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
# Get schema files from genes list file
genes_list = os.path.join(schema_directory, '.genes_list')
genes = fo.pickle_loader(genes_list)

fasta_paths = [os.path.join(schema_directory, file) for file in genes]
fasta_paths.sort()

# total number of loci
# Total number of loci
total_loci = len(fasta_paths)
# total number of alelles
# Total number of alelles
total_alleles = sum(list(map(fao.count_sequences, fasta_paths)))

# Get the name of the species from the provided id
# or vice-versa
# Get the name of the species from the provided id or vice-versa
species_info = cr.species_ids(species_id, nomenclature_server, headers_get)
if isinstance(species_info, list):
species_id, species_name = species_info
@@ -994,30 +990,29 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
print('Number of loci: {0}'.format(total_loci))
print('Number of alleles: {0}\n'.format(total_alleles))

# verify schema configs
# Verify schema config
print('Verifying schema configs...')
# load schema config file
# Load schema config file
configs = pv.read_configs(schema_directory, '.schema_config')

# validate arguments values
# Validate arguments values
schema_ptfs = configs['prodigal_training_file']
ptf_info = pv.validate_ptf(None, schema_directory, schema_ptfs, True)
if ptf_info[0] is None or ptf_info[2] is True:
sys.exit('Please ensure that the schema\'s directory includes the '
'Prodigal training file used to create the schema.')
sys.exit(ct.LOADSCHEMA_MISSING_PTF)

bsr_val = pv.bsr_type(configs.get('bsr', ''))
msl_val = pv.minimum_sequence_length_type(configs.get('minimum_locus_length', ''))
tt_val = pv.translation_table_type(configs.get('translation_table', ''))
st_val = pv.size_threshold_type(configs.get('size_threshold', ''))
cv_val = configs.get('chewBBACA_version', '')[0]
# add window size
# Add window size
ws_val = pv.validate_ws(configs.get('word_size', None))
cs_val = pv.validate_cs(configs.get('cluster_sim', None))
rf_val = pv.validate_rf(configs.get('representative_filter', None))
if_val = pv.validate_if(configs.get('intraCluster_filter', None))

# dictionary with schema parameters values to send to the NS
# Dictionary with schema parameters values to send to the NS
params = {'bsr': bsr_val, 'prodigal_training_file': ptf_info[1],
'translation_table': tt_val, 'minimum_locus_length': msl_val,
'chewBBACA_version': 'chewBBACA {0}'.format(cv_val),
@@ -1040,7 +1035,7 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
ptf_file = ptf_info[0]
ptf_hash = ptf_info[1]

# determine schema status
# Determine schema status
upload_type = schema_status(nomenclature_server, headers_get,
schema_name, species_id,
continue_up)
@@ -1055,15 +1050,15 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
print('Schema exists and is incomplete '
'("{0}", id={1})'.format(schema_name, schema_id))

# get schema description
# Get schema description
if continue_up is False:
if description_file is not None and os.path.isfile(description_file) is True:
# determine file hash
# Determine file hash
description_hash = fo.hash_file(description_file, hashlib.blake2b())
print('Schema description: {0}'.format(description_file))
else:
print('Could not get a description from a file. '
'Reset to schema name.')
'Will use the name of the schema as description.')
description_file = 'schema_description.txt'
with open(description_file, 'w') as sd:
sd.write(schema_name)
@@ -1073,12 +1068,11 @@ def main(schema_directory, species_id, schema_name, loci_prefix,

print('\n-- Schema Pre-processing --')

# hash schema files to get unique identifiers based on content
# Hash schema files to get unique identifiers based on content
hashed_files = {fo.hash_file(file, hashlib.blake2b()): file for file in fasta_paths}
print('Determining data to upload...')
absent_loci = fasta_paths
if upload_type[0] == 'incomplete':

loci_info, absent_loci, fasta_paths = schema_completedness(nomenclature_server, species_id,
upload_type[1], headers_get,
hashed_files)
@@ -1088,48 +1082,48 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
print(' Loci without the full set of alleles: '
'{0}\n'.format(len(fasta_paths)))

# create inputs for QC step
# Create inputs for QC step
inputs = [(file,
file.split('/')[-1].split('.fasta')[0],
int(params['translation_table']),
0,
None) for file in fasta_paths]

# validate schema data and create files with translated sequences
# Validate schema data and create files with translated sequences
print('Translating sequences based on schema configs...')
qc_results = []
genes_pools = multiprocessing.Pool(processes=cpu_cores)
rawr = genes_pools.map_async(quality_control, inputs,
callback=qc_results.extend)
rawr.wait()

# get invalid alleles
# Get invalid alleles
invalid_alleles = [r[2] for r in qc_results]
invalid_alleles = list(itertools.chain.from_iterable(invalid_alleles))
invalid_identifiers = set([r[0] for r in invalid_alleles])

print(' Found a total of {0} invalid '
'alleles.\n'.format(len(invalid_identifiers)))

# list translated sequences files
# List translated sequences files
dna_files = [r[0] for r in qc_results]
prot_files = [r[1] for r in qc_results]

# determine loci missing annotations
# Determine loci missing annotations
miss_annotation = [pf for pf in prot_files
if pf.split('_prots')[0] + '.fasta' in absent_loci]

print('Loci missing UniProt annotation: {0}'.format(len(miss_annotation)))

queries_files = []
if len(miss_annotation) > 0:
# create SPARQL queries to query UniProt SPARQL endpoint
# Create SPARQL queries to query UniProt SPARQL endpoint
print('Creating SPARQL queries to search UniProt for annotations...')
with concurrent.futures.ThreadPoolExecutor(max_workers=ct.UNIPROT_SPARQL_THREADS) as executor:
for res in executor.map(create_uniprot_queries, miss_annotation):
queries_files.append(res)

# multithreaded annotation searching
# Multithreaded annotation searching
print('Searching for annotations on UniProt...')
loci_annotations = {}
total_found = 0
@@ -1142,7 +1136,7 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
print('\r', 'Searched annotations for '
'{0}/{1} loci'.format(total_found, total_loci), end='')

# get user and custom annotations
# Get user and custom annotations
if os.path.isfile(str(annotations)) is True:
user_annotations = import_annotations(annotations)
valid = 0
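The upload step above builds `hashed_files` by hashing each FASTA file with `blake2b` to obtain content-based identifiers. A minimal sketch of that kind of file hashing, relying only on `hashlib` (chewBBACA uses its own `fo.hash_file` helper, so this is an illustration, not the project's code):

```python
# Minimal sketch of content-based file hashing with blake2b.
import hashlib


def hash_file(path, chunk_size=65536):
    """Return the blake2b hex digest of a file's contents."""
    hasher = hashlib.blake2b()
    with open(path, 'rb') as infile:
        # Read the file in chunks to keep memory usage low for large FASTAs
        for chunk in iter(lambda: infile.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

# hashed_files = {hash_file(f): f for f in fasta_paths}
```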
14 changes: 7 additions & 7 deletions CHEWBBACA/UniprotFinder/annotate_schema.py
Expand Up @@ -247,7 +247,7 @@ def proteome_annotations(schema_directory, temp_directory, taxa,
return proteome_results


def sparql_annotations(loci_files, translation_table):
def sparql_annotations(loci_files, translation_table, cpu_cores):
"""Retrieve annotations from UniProt's SPARQL endpoint.
Parameters
@@ -264,13 +264,13 @@ def sparql_annotations(loci_files, translation_table):
URL to the page of the record that matched the
locus.
"""
# create inputs to multiprocessing
# Create inputs to multiprocessing
uniprot_args = [[gene, translation_table, ur.get_annotation]
for gene in loci_files]

# this works with all alleles in the loci to maximize
# chance of finding info
workers = ct.UNIPROT_SPARQL_THREADS
# This works with all alleles in the loci to maximize
# chance of finding annotations
workers = cpu_cores if cpu_cores <= ct.UNIPROT_SPARQL_THREADS else ct.UNIPROT_SPARQL_THREADS
annotations = mo.map_async_parallelizer(uniprot_args,
mo.function_helper,
workers,
@@ -360,7 +360,7 @@ def main(schema_directory, output_directory, genes_list, protein_table,
print('Schema: {0}'.format(schema_directory))
print('Number of loci: {0}'.format(len(loci_paths)))

# find annotations based on reference proteomes for species
# Find annotations based on reference proteomes for species
proteome_results = {}
if taxa is not None:
proteome_results = proteome_annotations(schema_directory,
@@ -394,7 +394,7 @@ def main(schema_directory, output_directory, genes_list, protein_table,
translation_table = 11

# Get annotations through UniProt SPARQL endpoint
results = sparql_annotations(loci_paths, translation_table)
results = sparql_annotations(loci_paths, translation_table, cpu_cores)
for i, r in enumerate(results):
if fo.file_basename(r[0], False) in loci_basenames:
if r[1] != '' or r[2] != '':
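The change to `sparql_annotations` caps the number of worker threads at `ct.UNIPROT_SPARQL_THREADS` (4, per the commit message), so the requested CPU count never translates into more simultaneous requests to UniProt's SPARQL endpoint. A minimal sketch of that capping pattern follows; `query_uniprot` is a hypothetical stand-in for the real query helper, and only the worker cap is the point here.

```python
# Minimal sketch of capping concurrent UniProt SPARQL requests.
import concurrent.futures

# Mirrors ct.UNIPROT_SPARQL_THREADS (4 per the commit message)
UNIPROT_SPARQL_THREADS = 4


def query_uniprot(locus_file):
    """Placeholder that would query UniProt's SPARQL endpoint for one locus."""
    return (locus_file, '', '')


def annotate_loci(loci_files, cpu_cores):
    # Never use more workers than the endpoint-friendly maximum,
    # regardless of how many CPU cores were requested.
    workers = min(cpu_cores, UNIPROT_SPARQL_THREADS)
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        return list(executor.map(query_uniprot, loci_files))
```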
8 changes: 8 additions & 0 deletions CHEWBBACA/utils/constants.py
@@ -457,3 +457,11 @@

# Invalid format for loci files
NON_FASTA_LOCI_EXCEPTION = ('The following loci files are not in FASTA format:\n{0}')

# User does not have permissions to upload schemas to Chewie-NS
LOADSCHEMA_NO_PERMISSIONS = ('Current user has no Administrator or Contributor '
'permissions.\nNot allowed to upload schemas.')

# PTF is missing from schema's directory
LOADSCHEMA_MISSING_PTF = ('Please ensure that the schema\'s directory includes the '
'Prodigal training file used to create the schema.')
