From e8e1f0ea632e2b9232b0234f62af79083f0fcb63 Mon Sep 17 00:00:00 2001 From: rfm-targa Date: Thu, 11 Jul 2024 19:09:04 +0100 Subject: [PATCH] Converting to tabs. --- CHEWBBACA/utils/core_functions.py | 1792 ++++++++++++++--------------- 1 file changed, 896 insertions(+), 896 deletions(-) diff --git a/CHEWBBACA/utils/core_functions.py b/CHEWBBACA/utils/core_functions.py index 5678b603..c325db58 100644 --- a/CHEWBBACA/utils/core_functions.py +++ b/CHEWBBACA/utils/core_functions.py @@ -15,854 +15,854 @@ import os try: - from utils import (constants as ct, - blast_wrapper as bw, - gene_prediction as gp, - file_operations as fo, - fasta_operations as fao, - sequence_clustering as sc, - sequence_manipulation as sm, - iterables_manipulation as im, - multiprocessing_operations as mo) + from utils import (constants as ct, + blast_wrapper as bw, + gene_prediction as gp, + file_operations as fo, + fasta_operations as fao, + sequence_clustering as sc, + sequence_manipulation as sm, + iterables_manipulation as im, + multiprocessing_operations as mo) except ModuleNotFoundError: - from CHEWBBACA.utils import (constants as ct, - blast_wrapper as bw, - gene_prediction as gp, - file_operations as fo, - fasta_operations as fao, - sequence_clustering as sc, - sequence_manipulation as sm, - iterables_manipulation as im, - multiprocessing_operations as mo) + from CHEWBBACA.utils import (constants as ct, + blast_wrapper as bw, + gene_prediction as gp, + file_operations as fo, + fasta_operations as fao, + sequence_clustering as sc, + sequence_manipulation as sm, + iterables_manipulation as im, + multiprocessing_operations as mo) def predict_genes(fasta_files, ptf_path, translation_table, - prodigal_mode, cpu_cores, output_directory): - """Execute Prodigal to predict coding sequences from Fasta files. - - Parameters - ---------- - fasta_files : list - List of paths to FASTA files with genomic - sequences. - ptf_path : str - Path to the Prodigal training file. Should - be NoneType if a training file is not provided. - translation_table : int - Genetic code used to predict and translate - coding sequences. - prodigal_mode : str - Prodigal execution mode. - cpu_cores : int - Number of processes that will run Prodigal in - parallel. - output_directory : str - Path to the directory where output files - with Prodigal's results will be stored in. - - Returns - ------- - failed_info : list - List that contains a list with the stderr for the - cases that Prodigal failed to predict genes for - and the path to the file with information about - failed cases. Returns NoneType if gene prediction - succeeded for all inputs. 
- """ - if ptf_path is not None: - # Read training file to create GeneFinder object - training_data = gp.read_training_file(ptf_path) - # Create GeneFinder object based on training data - gene_finder = gp.create_gene_finder(training_data, True, True, False) - elif ptf_path is None and prodigal_mode == 'meta': - # Create GeneFinder object to run in meta mode - gene_finder = gp.create_gene_finder(None, True, True, True) - else: - gene_finder = None - - common_args = [output_directory, gene_finder, translation_table] - - # Divide inputs into equal number of sublists for maximum process - # progress resolution - pyrodigal_inputs = im.divide_list_into_n_chunks(list(fasta_files.items()), - len(fasta_files)) - - # Add common arguments to all sublists - pyrodigal_inputs = im.multiprocessing_inputs(pyrodigal_inputs, - common_args, - gp.predict_genome_genes) - - # Run Pyrodigal to predict genes - # Need to use ThreadPool. Pyrodigal might hang when using Pool - pyrodigal_results = mo.map_async_parallelizer(pyrodigal_inputs, - mo.function_helper, - cpu_cores, - show_progress=True, - pool_type='threadpool') - - # Determine if Pyrodigal predicted genes for all genomes - failed = {line[0][0]: line[1] - for line in pyrodigal_results - if line[1] == 0 - or isinstance(line[1], str) is True} - - cds_counts = {line[0][1]: line[1] - for line in pyrodigal_results - if isinstance(line[1], int) is True} - - total_cds = sum([line[1] - for line in pyrodigal_results - if isinstance(line[1], int) is True]) - - cds_fastas = [line[2] for line in pyrodigal_results if line[2] is not None] - - cds_hashes = {line[0][1]: line[3] for line in pyrodigal_results if line[3] is not None} - - # Merge dictionaries with info about CDSs closes to contig tips - close_to_tip = [line[4] for line in pyrodigal_results if len(line[4]) > 0] - close_to_tip = im.merge_dictionaries(close_to_tip) - - return [failed, total_cds, cds_fastas, cds_hashes, cds_counts, close_to_tip] + prodigal_mode, cpu_cores, output_directory): + """Execute Prodigal to predict coding sequences from Fasta files. + + Parameters + ---------- + fasta_files : list + List of paths to FASTA files with genomic + sequences. + ptf_path : str + Path to the Prodigal training file. Should + be NoneType if a training file is not provided. + translation_table : int + Genetic code used to predict and translate + coding sequences. + prodigal_mode : str + Prodigal execution mode. + cpu_cores : int + Number of processes that will run Prodigal in + parallel. + output_directory : str + Path to the directory where output files + with Prodigal's results will be stored in. + + Returns + ------- + failed_info : list + List that contains a list with the stderr for the + cases that Prodigal failed to predict genes for + and the path to the file with information about + failed cases. Returns NoneType if gene prediction + succeeded for all inputs. 
+	"""
+	if ptf_path is not None:
+		# Read training file to create GeneFinder object
+		training_data = gp.read_training_file(ptf_path)
+		# Create GeneFinder object based on training data
+		gene_finder = gp.create_gene_finder(training_data, True, True, False)
+	elif ptf_path is None and prodigal_mode == 'meta':
+		# Create GeneFinder object to run in meta mode
+		gene_finder = gp.create_gene_finder(None, True, True, True)
+	else:
+		gene_finder = None
+
+	common_args = [output_directory, gene_finder, translation_table]
+
+	# Divide inputs into equal number of sublists for maximum process
+	# progress resolution
+	pyrodigal_inputs = im.divide_list_into_n_chunks(list(fasta_files.items()),
+													len(fasta_files))
+
+	# Add common arguments to all sublists
+	pyrodigal_inputs = im.multiprocessing_inputs(pyrodigal_inputs,
+												 common_args,
+												 gp.predict_genome_genes)
+
+	# Run Pyrodigal to predict genes
+	# Need to use ThreadPool. Pyrodigal might hang when using Pool
+	pyrodigal_results = mo.map_async_parallelizer(pyrodigal_inputs,
+												  mo.function_helper,
+												  cpu_cores,
+												  show_progress=True,
+												  pool_type='threadpool')
+
+	# Determine if Pyrodigal predicted genes for all genomes
+	failed = {line[0][0]: line[1]
+			  for line in pyrodigal_results
+			  if line[1] == 0
+			  or isinstance(line[1], str) is True}
+
+	cds_counts = {line[0][1]: line[1]
+				  for line in pyrodigal_results
+				  if isinstance(line[1], int) is True}
+
+	total_cds = sum([line[1]
+					 for line in pyrodigal_results
+					 if isinstance(line[1], int) is True])
+
+	cds_fastas = [line[2] for line in pyrodigal_results if line[2] is not None]
+
+	cds_hashes = {line[0][1]: line[3] for line in pyrodigal_results if line[3] is not None}
+
+	# Merge dictionaries with info about CDSs close to contig tips
+	close_to_tip = [line[4] for line in pyrodigal_results if len(line[4]) > 0]
+	close_to_tip = im.merge_dictionaries(close_to_tip)
+
+	return [failed, total_cds, cds_fastas, cds_hashes, cds_counts, close_to_tip]


def merge_dna_dedup(dedup_files, ids_map):
-    """Select distinct DNA sequences based on sequence deduplication results.
-
-    Parameters
-    ----------
-    dedup_files : list
-        List with the paths to the files with the results
-        from sequence deduplication.
-    ids_map : dict
-        List with two dictionaries mapping input
-        identifiers to integer identifiers and vice
-        versa.
-
-    Return
-    ------
-    merged_results : dict
-        Dictionary with distinct sequence hashes as keys
-        and lists with the protid and input integer identifier
-        of the sequence chosen as representative followed by the
-        list of other inputs that contain the sequence
-        (e.g. 02bf9c4f695ce7...: [3567, 1, 2, 3, 10] ).
-        The lists are encoded with polyline encoding.
-    total_duplicated : dict
-        Total number of times a sequence was duplicated.
-    representative_seqids : list
-        List with the sequence identifiers of the
-        representative sequences.
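# A minimal sketch of the fan-out used by predict_genes above, assuming only
# the Python standard library. chewBBACA's mo.map_async_parallelizer wraps a
# thread pool here because Pyrodigal can hang inside a process Pool; the
# worker below is a hypothetical stand-in for gp.predict_genome_genes.
from multiprocessing.pool import ThreadPool

def predict_genome_genes_stub(task):
	# task mirrors one sublist: [(input_id, fasta_path), out_dir, finder, table]
	input_id, fasta_path = task[0]
	# A real worker would run Pyrodigal here and return CDS counts, paths, etc.
	return input_id, 0

def map_async_threadpool(inputs, cpu_cores):
	# map_async + get mirrors the helper's asynchronous mapping over sublists
	with ThreadPool(cpu_cores) as pool:
		return pool.map_async(predict_genome_genes_stub, inputs).get()

# Example: two one-genome sublists processed with two threads.
results = map_async_threadpool(
	[[('genome1', 'genome1.fasta'), 'out', None, 11],
	 [('genome2', 'genome2.fasta'), 'out', None, 11]], cpu_cores=2)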
- """ - merged_results = {} - total_duplicated = 0 - representative_seqids = [] - for file in dedup_files: - results = fo.pickle_loader(file) - # Sequence hash and list with protid:inputID pairs - for hashid, ids in results.items(): - stored_ids = merged_results.get(hashid, []) - # Already have a representative seqid - if len(stored_ids) > 0: - stored_ids = im.polyline_decoding(stored_ids) - # Get input integer identifiers - new_ids = [ids[i] for i in range(1, len(ids), 2)] - total_duplicated += (len(ids)/2) - # New distinct sequence - else: - # Choose first sequence seqid as representative - rep_seqid = '{0}-protein{1}'.format(ids_map[1][ids[1]], ids[0]) - representative_seqids.append(rep_seqid) - new_ids = ids[0:2] + [ids[i] for i in range(3, len(ids), 2)] - # Do not count representative as duplicate - total_duplicated += (len(ids)/2) - 1 - - merged_results[hashid] = im.polyline_encoding(stored_ids+new_ids) - - return [merged_results, total_duplicated, representative_seqids] + """Select distinct DNA sequences based on sequence deduplication results. + + Parameters + ---------- + dedup_files : list + List with the paths to the files with the results + from sequence deduplication. + ids_map : dict + List with two dictionaries mapping input + identifiers to integer identifiers and vice + versa. + + Return + ------ + merged_results : dict + Dictionary with distinct sequence hashes as keys + and lists with the protid and input integer identifier + of the sequence chosen as representative followed by the + list of other inputs that contain the sequence + (e.g. 02bf9c4f695ce7...: [3567, 1, 2, 3, 10] ). + The lists are encoded with polyline encoding. + total_duplicated : dict + Total number of times a sequence was duplicated. + representative_seqids : list + List with the sequence identifiers of the + representative sequences. + """ + merged_results = {} + total_duplicated = 0 + representative_seqids = [] + for file in dedup_files: + results = fo.pickle_loader(file) + # Sequence hash and list with protid:inputID pairs + for hashid, ids in results.items(): + stored_ids = merged_results.get(hashid, []) + # Already have a representative seqid + if len(stored_ids) > 0: + stored_ids = im.polyline_decoding(stored_ids) + # Get input integer identifiers + new_ids = [ids[i] for i in range(1, len(ids), 2)] + total_duplicated += (len(ids)/2) + # New distinct sequence + else: + # Choose first sequence seqid as representative + rep_seqid = '{0}-protein{1}'.format(ids_map[1][ids[1]], ids[0]) + representative_seqids.append(rep_seqid) + new_ids = ids[0:2] + [ids[i] for i in range(3, len(ids), 2)] + # Do not count representative as duplicate + total_duplicated += (len(ids)/2) - 1 + + merged_results[hashid] = im.polyline_encoding(stored_ids+new_ids) + + return [merged_results, total_duplicated, representative_seqids] def merge_protein_dedup(dedup_files, ids_map): - """Select distinct proteins based on sequence deduplication results. - - Parameters - ---------- - dedup_files : list - List with the paths to the files with the results - from sequence deduplication. - ids_map : dict - List with two dictionaries mapping input - identifiers to integer identifiers and vice - versa. - - Return - ------ - merged_results : dict - Dictionary with distinct sequence hashes as keys - and lists with protid and input integer identifiers - pairs. The lists are encoded with polyline encoding. - total_duplicated : dict - Total number of times a sequence was duplicated. 
- representative_seqids : list - List with the sequence identifiers of the - representative sequences. - """ - merged_results = {} - total_duplicated = 0 - representative_seqids = [] - for file in dedup_files: - results = fo.pickle_loader(file) - # Sequence hash and list with protid:inputID pairs - for hashid, ids in results.items(): - stored_ids = merged_results.get(hashid, []) - new_ids = ids - # Already have a representative seqid - if len(stored_ids) > 0: - stored_ids = im.polyline_decoding(stored_ids) - total_duplicated += (len(ids)/2) - else: - rep_seqid = '{0}-protein{1}'.format(ids_map[1][ids[1]], ids[0]) - representative_seqids.append(rep_seqid) - total_duplicated += (len(ids)/2) - 1 - - merged_results[hashid] = im.polyline_encoding(stored_ids+new_ids) - - return [merged_results, total_duplicated, representative_seqids] + """Select distinct proteins based on sequence deduplication results. + + Parameters + ---------- + dedup_files : list + List with the paths to the files with the results + from sequence deduplication. + ids_map : dict + List with two dictionaries mapping input + identifiers to integer identifiers and vice + versa. + + Return + ------ + merged_results : dict + Dictionary with distinct sequence hashes as keys + and lists with protid and input integer identifiers + pairs. The lists are encoded with polyline encoding. + total_duplicated : dict + Total number of times a sequence was duplicated. + representative_seqids : list + List with the sequence identifiers of the + representative sequences. + """ + merged_results = {} + total_duplicated = 0 + representative_seqids = [] + for file in dedup_files: + results = fo.pickle_loader(file) + # Sequence hash and list with protid:inputID pairs + for hashid, ids in results.items(): + stored_ids = merged_results.get(hashid, []) + new_ids = ids + # Already have a representative seqid + if len(stored_ids) > 0: + stored_ids = im.polyline_decoding(stored_ids) + total_duplicated += (len(ids)/2) + else: + rep_seqid = '{0}-protein{1}'.format(ids_map[1][ids[1]], ids[0]) + representative_seqids.append(rep_seqid) + total_duplicated += (len(ids)/2) - 1 + + merged_results[hashid] = im.polyline_encoding(stored_ids+new_ids) + + return [merged_results, total_duplicated, representative_seqids] def merge_get_seqids(dedup_files, ids_map): - """Select distinct sequences based on sequence deduplication results. - - Parameters - ---------- - dedup_files : list - List with the paths to the files with the results - from sequence deduplication. - ids_map : dict - List with two dictionaries mapping input - identifiers to integer identifiers and vice - versa. - - Return - ------ - total_duplicated : dict - Total number of times a sequence was duplicated. - representative_seqids : list - List with the sequence identifiers of the - representative sequences. - """ - total_duplicated = 0 - representative_seqids = [] - representative_hashes = set() - for file in dedup_files: - results = fo.pickle_loader(file) - # Sequence hash and list with protid:inputID pairs - for hashid, ids in results.items(): - if hashid not in representative_hashes: - rep_seqid = '{0}-protein{1}'.format(ids_map[1][ids[1]], ids[0]) - representative_seqids.append(rep_seqid) - representative_hashes.add(hashid) - total_duplicated += (len(ids)/2) - 1 - else: - total_duplicated += (len(ids)/2) - - return [None, total_duplicated, representative_seqids] + """Select distinct sequences based on sequence deduplication results. 
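# A toy walkthrough of the merge bookkeeping shared by merge_dna_dedup and
# merge_protein_dedup, using plain Python lists in place of the
# polyline-encoded integer lists (im.polyline_encoding compresses them; that
# step is elided here). Each dedup record maps a sequence hash to
# [protid, inputID, protid, inputID, ...] pairs.
def merge_records(per_file_records):
	merged, duplicated = {}, 0
	for records in per_file_records:
		for hashid, ids in records.items():
			stored = merged.get(hashid, [])
			if stored:
				# Hash seen before: keep only the input IDs (odd positions)
				new_ids = [ids[i] for i in range(1, len(ids), 2)]
				duplicated += len(ids) // 2
			else:
				# First sighting: keep protid + first input, then input IDs,
				# and do not count the representative as a duplicate
				new_ids = ids[0:2] + [ids[i] for i in range(3, len(ids), 2)]
				duplicated += len(ids) // 2 - 1
			merged[hashid] = stored + new_ids
	return merged, duplicated

# 'abc123' occurs in inputs 1 and 2 of file one and in input 3 of file two.
merged, dups = merge_records([{'abc123': [35, 1, 7, 2]}, {'abc123': [4, 3]}])
assert merged['abc123'] == [35, 1, 2, 3] and dups == 2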
+ + Parameters + ---------- + dedup_files : list + List with the paths to the files with the results + from sequence deduplication. + ids_map : dict + List with two dictionaries mapping input + identifiers to integer identifiers and vice + versa. + + Return + ------ + total_duplicated : dict + Total number of times a sequence was duplicated. + representative_seqids : list + List with the sequence identifiers of the + representative sequences. + """ + total_duplicated = 0 + representative_seqids = [] + representative_hashes = set() + for file in dedup_files: + results = fo.pickle_loader(file) + # Sequence hash and list with protid:inputID pairs + for hashid, ids in results.items(): + if hashid not in representative_hashes: + rep_seqid = '{0}-protein{1}'.format(ids_map[1][ids[1]], ids[0]) + representative_seqids.append(rep_seqid) + representative_hashes.add(hashid) + total_duplicated += (len(ids)/2) - 1 + else: + total_duplicated += (len(ids)/2) + + return [None, total_duplicated, representative_seqids] def exclude_duplicates(fasta_files, temp_directory, cpu_cores, - ids_map, protein=False, only_seqids=False): - """Identify duplicated sequences and select distinct set of sequences. - - Parameters - ---------- - fasta_files : list - List with paths to FASTA files. - temp_directory : str - Path to the directory where new files will be - created. - cpu_cores : int - Number of deduplication processes to run in - parallel. - ids_map : list - List with two dictionaries mapping input - identifiers to integer identifiers and vice - versa. - protein : bool - If sequences to deduplicate are proteins. - only_seqids : bool - Return only the list of distinct seqids. - Do not return the hash table with encoded results. - - Returns - ------- - A list with the following elements: - distinct_seqids : list - List with the sequence identifiers of distinct - sequences. - distinct_seqs : str - Path to the FASTA file with distinct sequences. 
- """ - # Define output files paths - output_files = [fo.join_paths(temp_directory, [f'distinct_{str(i+1)}.fasta']) - for i, file in enumerate(fasta_files)] - # Create input lists to pass to multiprocessing - dedup_inputs = im.aggregate_iterables([fasta_files, output_files]) - dedup_inputs = im.multiprocessing_inputs(dedup_inputs, [ids_map[0]], - sm.determine_distinct) - - # Determine set of distinct sequences per file - dedup_results = mo.map_async_parallelizer(dedup_inputs, - mo.function_helper, - cpu_cores, - show_progress=False) - - if only_seqids is True: - merge_results = merge_get_seqids(dedup_results, ids_map) - else: - if protein is False: - merge_results = merge_dna_dedup(dedup_results, ids_map) - else: - merge_results = merge_protein_dedup(dedup_results, ids_map) - - hash_table, duplicated_count, representative_seqids = merge_results - - if hash_table is not None: - hash_table_file = fo.join_paths(temp_directory, ['distinct.hashtable']) - fo.pickle_dumper(hash_table, hash_table_file) - - # Concatenate Fasta files from parallel processes - dedup_outfiles = [files[1] for files in dedup_inputs] - merge_fasta = fo.join_paths(temp_directory, ['concat.fasta']) - merge_fasta = fo.concatenate_files(dedup_outfiles, merge_fasta) - - # Create index for concatenated Fasta and get representative sequences - merge_fasta_index = fao.index_fasta(merge_fasta) - representative_fasta = fo.join_paths(temp_directory, ['distinct.fasta']) - fao.get_sequences_by_id(merge_fasta_index, representative_seqids, - representative_fasta, 50000) - - # Remove intermediate files - fo.remove_files(dedup_outfiles+dedup_results+[merge_fasta]) - - return [hash_table, representative_seqids, - representative_fasta, duplicated_count] + ids_map, protein=False, only_seqids=False): + """Identify duplicated sequences and select distinct set of sequences. + + Parameters + ---------- + fasta_files : list + List with paths to FASTA files. + temp_directory : str + Path to the directory where new files will be + created. + cpu_cores : int + Number of deduplication processes to run in + parallel. + ids_map : list + List with two dictionaries mapping input + identifiers to integer identifiers and vice + versa. + protein : bool + If sequences to deduplicate are proteins. + only_seqids : bool + Return only the list of distinct seqids. + Do not return the hash table with encoded results. + + Returns + ------- + A list with the following elements: + distinct_seqids : list + List with the sequence identifiers of distinct + sequences. + distinct_seqs : str + Path to the FASTA file with distinct sequences. 
+	"""
+	# Define output files paths
+	output_files = [fo.join_paths(temp_directory, [f'distinct_{str(i+1)}.fasta'])
+					for i, file in enumerate(fasta_files)]
+	# Create input lists to pass to multiprocessing
+	dedup_inputs = im.aggregate_iterables([fasta_files, output_files])
+	dedup_inputs = im.multiprocessing_inputs(dedup_inputs, [ids_map[0]],
+											 sm.determine_distinct)
+
+	# Determine set of distinct sequences per file
+	dedup_results = mo.map_async_parallelizer(dedup_inputs,
+											  mo.function_helper,
+											  cpu_cores,
+											  show_progress=False)
+
+	if only_seqids is True:
+		merge_results = merge_get_seqids(dedup_results, ids_map)
+	else:
+		if protein is False:
+			merge_results = merge_dna_dedup(dedup_results, ids_map)
+		else:
+			merge_results = merge_protein_dedup(dedup_results, ids_map)
+
+	hash_table, duplicated_count, representative_seqids = merge_results
+
+	if hash_table is not None:
+		hash_table_file = fo.join_paths(temp_directory, ['distinct.hashtable'])
+		fo.pickle_dumper(hash_table, hash_table_file)
+
+	# Concatenate Fasta files from parallel processes
+	dedup_outfiles = [files[1] for files in dedup_inputs]
+	merge_fasta = fo.join_paths(temp_directory, ['concat.fasta'])
+	merge_fasta = fo.concatenate_files(dedup_outfiles, merge_fasta)
+
+	# Create index for concatenated Fasta and get representative sequences
+	merge_fasta_index = fao.index_fasta(merge_fasta)
+	representative_fasta = fo.join_paths(temp_directory, ['distinct.fasta'])
+	fao.get_sequences_by_id(merge_fasta_index, representative_seqids,
+							representative_fasta, 50000)
+
+	# Remove intermediate files
+	fo.remove_files(dedup_outfiles+dedup_results+[merge_fasta])
+
+	return [hash_table, representative_seqids,
+			representative_fasta, duplicated_count]


def exclude_small(fasta_file, minimum_length, variation=0):
-    """Identify sequences smaller that a specified length value.
-
-    Parameters
-    ----------
-    fasta_file : str
-        Path to a FASTA file.
-    minimum_length : int
-        Sequences with a length value below this value are
-        considered small.
-    variation : float
-        Accept sequences with length variation of up to
-        minus (`minimum_length`*`variation`).
-
-    Returns
-    -------
-    small_seqids : list
-        List with the sequence identifiers of small
-        sequences.
-    ss_lines : list
-        List with one string per small sequence. Each string
-        represents an exception message for a sequence that
-        is small.
-    """
-    # determine small sequences and keep their seqids
-    small_seqids = sm.determine_small(fasta_file, minimum_length, variation)
-
-    ss_lines = ['{0}: smaller than {1} chars'.format(seqid, minimum_length)
-                for seqid in small_seqids]
-
-    return [small_seqids, ss_lines]
+	"""Identify sequences smaller than a specified length value.
+
+	Parameters
+	----------
+	fasta_file : str
+		Path to a FASTA file.
+	minimum_length : int
+		Sequences with a length value below this value are
+		considered small.
+	variation : float
+		Accept sequences with length variation of up to
+		minus (`minimum_length`*`variation`).
+
+	Returns
+	-------
+	small_seqids : list
+		List with the sequence identifiers of small
+		sequences.
+	ss_lines : list
+		List with one string per small sequence. Each string
+		represents an exception message for a sequence that
+		is small.
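# A minimal sketch of the per-file deduplication that feeds exclude_duplicates,
# assuming Biopython is installed (chewBBACA already depends on it). The exact
# hashing scheme lives in sm.determine_distinct; SHA-256 over the uppercase
# sequence is an assumption here. The first seqid per distinct hash plays the
# role of the representative.
import hashlib
from Bio import SeqIO

def distinct_records(fasta_path):
	seen = {}
	for record in SeqIO.parse(fasta_path, 'fasta'):
		seq_hash = hashlib.sha256(str(record.seq).upper().encode()).hexdigest()
		# setdefault keeps the first (representative) seqid per distinct hash
		seen.setdefault(seq_hash, []).append(record.id)
	return seen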
+ """ + # determine small sequences and keep their seqids + small_seqids = sm.determine_small(fasta_file, minimum_length, variation) + + ss_lines = ['{0}: smaller than {1} chars'.format(seqid, minimum_length) + for seqid in small_seqids] + + return [small_seqids, ss_lines] def translate_sequences(sequence_ids, sequences_file, temp_directory, - translation_table, minimum_length, cpu_cores): - """Translate DNA sequences. - - Returns information about sequences that are untranslatable - and saves translatable DNA sequences and proteins to FASTA files. - - Parameters - ---------- - sequence_ids : list - List with the identifiers of the sequences that - should be translated. - sequences_file : str - Path to the FASTA file with the DNA sequences. - temp_directory : str - Path to the directory where new files will be - created. - translation_table : int - Genetic code used to translate coding sequences. - minimum_length : int - Sequences with a length value below this value are - considered small and are excluded. - cpu_cores : int - Number of translation processes to run in - parallel. - - Returns - ------- - A list with the following elements: - dna_file : str - Path to the FASTA with all DNA sequences - that could be translated. - protein_file : str - Path to the FASTA file with the translated - sequences. - untrans_seqids : list - List with the identifiers of the sequences - that could not be translated. - untrans_lines : list - List with one string per untranslatable - sequence. Each string has a sequence - identifier and a small description that - indicates why the sequence could not be - translated. - """ - # Divide inputs into sublists - translation_inputs = im.divide_list_into_n_chunks(sequence_ids, cpu_cores) - - # Create paths to files with protein sequences - protein_template = fo.join_paths(temp_directory, ['translated_{0}.fasta']) - protein_files = [protein_template.format(i+1) - for i in range(len(translation_inputs))] - - # Add common args to sublists - common_args = [sequences_file, translation_table, minimum_length] - translation_inputs = im.aggregate_iterables([translation_inputs, - protein_files]) - translation_inputs = im.multiprocessing_inputs(translation_inputs, - common_args, - sm.translate_coding_sequences) - - # Translate sequences - translation_results = mo.map_async_parallelizer(translation_inputs, - mo.function_helper, - cpu_cores, - show_progress=True) - - # Concatenate files - protein_file = fo.join_paths(temp_directory, ['translated.fasta']) - protein_file = fo.concatenate_files(protein_files, protein_file) - fo.remove_files(protein_files) - - # Determine sequences that could not be translated - untrans_lines = [] - untrans_seqids = [] - for res in translation_results: - if len(res[0]) > 0: - untrans_lines.extend([f'{r[0]}: {r[1]}' for r in res[0]]) - untrans_seqids.extend([r[0] for r in res[0]]) - - return [protein_file, untrans_seqids, untrans_lines] + translation_table, minimum_length, cpu_cores): + """Translate DNA sequences. + + Returns information about sequences that are untranslatable + and saves translatable DNA sequences and proteins to FASTA files. + + Parameters + ---------- + sequence_ids : list + List with the identifiers of the sequences that + should be translated. + sequences_file : str + Path to the FASTA file with the DNA sequences. + temp_directory : str + Path to the directory where new files will be + created. + translation_table : int + Genetic code used to translate coding sequences. 
+ minimum_length : int + Sequences with a length value below this value are + considered small and are excluded. + cpu_cores : int + Number of translation processes to run in + parallel. + + Returns + ------- + A list with the following elements: + dna_file : str + Path to the FASTA with all DNA sequences + that could be translated. + protein_file : str + Path to the FASTA file with the translated + sequences. + untrans_seqids : list + List with the identifiers of the sequences + that could not be translated. + untrans_lines : list + List with one string per untranslatable + sequence. Each string has a sequence + identifier and a small description that + indicates why the sequence could not be + translated. + """ + # Divide inputs into sublists + translation_inputs = im.divide_list_into_n_chunks(sequence_ids, cpu_cores) + + # Create paths to files with protein sequences + protein_template = fo.join_paths(temp_directory, ['translated_{0}.fasta']) + protein_files = [protein_template.format(i+1) + for i in range(len(translation_inputs))] + + # Add common args to sublists + common_args = [sequences_file, translation_table, minimum_length] + translation_inputs = im.aggregate_iterables([translation_inputs, + protein_files]) + translation_inputs = im.multiprocessing_inputs(translation_inputs, + common_args, + sm.translate_coding_sequences) + + # Translate sequences + translation_results = mo.map_async_parallelizer(translation_inputs, + mo.function_helper, + cpu_cores, + show_progress=True) + + # Concatenate files + protein_file = fo.join_paths(temp_directory, ['translated.fasta']) + protein_file = fo.concatenate_files(protein_files, protein_file) + fo.remove_files(protein_files) + + # Determine sequences that could not be translated + untrans_lines = [] + untrans_seqids = [] + for res in translation_results: + if len(res[0]) > 0: + untrans_lines.extend([f'{r[0]}: {r[1]}' for r in res[0]]) + untrans_seqids.extend([r[0] for r in res[0]]) + + return [protein_file, untrans_seqids, untrans_lines] def cluster_sequences(sequences, word_size, window_size, clustering_sim, - representatives, grow_clusters, kmer_offset, - seq_num_cluster, temp_directory, cpu_cores, - divide, position): - """Cluster sequences based on the proportion of shared minimizers. - - Parameters - ---------- - sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. - word_size : int - Value k for the k-mer size. - window_size : int - Value for the window size/number of consecutive - k-mers per window. - clustering_sim : float - Similarity threshold to add a sequence to - a cluster. - representatives : dict - Dictionary with k-mers as keys and a list with - identifiers of sequences that contain that k-mer - as values. - grow_clusters : bool - If it is allowed to create new clusters. - kmer_offset : int - Value to indicate offset of consecutive kmers. - seq_num_cluster : int - Maximum number of clusters that a sequence can be - added to. - temp_directory : str - Path to the directory where the clustering results - will be saved to. - cpu_cores : int - Number of clustering processes to run in parallel. - divide : int - Create smaller groups of sequences that can - be processed in parallel. - position : bool - True if the start position for each k-mer should be saved. - False otherwise. - - Returns - ------- - clusters : dict - Dictionary with representative sequence identifiers - as keys and lists of tuples as values. 
Each tuple - contains a sequence identifier of a clustered - sequence, the proportion of shared kmers with the - representative and the length of the clustered - sequence. - """ - # Sort sequences by length - sorted_seqs = {k: v for k, v in im.sort_iterable(sequences.items(), - sort_key=lambda x: len(x[1]), - reverse=True)} - - if divide is not None: - # Divide input sequences into smaller groups - cluster_inputs = im.split_iterable(sorted_seqs, divide) - else: - cluster_inputs = [sorted_seqs] - - common_args = [word_size, window_size, clustering_sim, - representatives, grow_clusters, kmer_offset, - position, seq_num_cluster, - sc.clusterer] - cluster_inputs = [[c, *common_args] for c in cluster_inputs] - - # Cluster proteins - clustering_results = mo.map_async_parallelizer(cluster_inputs, - mo.function_helper, - cpu_cores, - show_progress=True) - - # Merge clusters - clusters = [d[0] for d in clustering_results] - clusters = im.merge_dictionaries(clusters) - rep_sequences = [d[1] for d in clustering_results] - rep_sequences = im.merge_dictionaries(rep_sequences) - - # Perform clustering with representatives - # This step does not run for AlleleCall as we do - # not need to compare schema representatives - # It runs for CreateSchema because the representatives - # from each cluster group were not compared - # No need to run this step if input sequences were not divided into smaller groups - # AlleleCall does not select new representatives, so this does not run - if len(cluster_inputs) > 1 and len(rep_sequences) > 0: - # Cluster representatives - rep_clusters = sc.clusterer(rep_sequences, word_size, - window_size, clustering_sim, - representatives, grow_clusters, - kmer_offset, position, - seq_num_cluster) - - merged_clusters = {} - for k, v in rep_clusters[0].items(): - # Merge clusters whose representatives are similar - for n in v: - # Representatives from other clusters are added with - # similarity score against new representative - # Clustered sequences from other clusters are added - # with similarity score against their representative - add_seqids = [n] + [s for s in clusters[n[0]] if s[0] != n[0]] - merged_clusters.setdefault(k, []).extend(add_seqids) - - clusters = merged_clusters - - # Sort clusters - clusters = {k: v for k, v in im.sort_iterable(clusters.items())} - - # Write file with clustering results - clusters_out = os.path.join(temp_directory, 'clusters.txt') - sc.write_clusters(clusters, clusters_out) - - return clusters + representatives, grow_clusters, kmer_offset, + seq_num_cluster, temp_directory, cpu_cores, + divide, position): + """Cluster sequences based on the proportion of shared minimizers. + + Parameters + ---------- + sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. + word_size : int + Value k for the k-mer size. + window_size : int + Value for the window size/number of consecutive + k-mers per window. + clustering_sim : float + Similarity threshold to add a sequence to + a cluster. + representatives : dict + Dictionary with k-mers as keys and a list with + identifiers of sequences that contain that k-mer + as values. + grow_clusters : bool + If it is allowed to create new clusters. + kmer_offset : int + Value to indicate offset of consecutive kmers. + seq_num_cluster : int + Maximum number of clusters that a sequence can be + added to. + temp_directory : str + Path to the directory where the clustering results + will be saved to. + cpu_cores : int + Number of clustering processes to run in parallel. 
+ divide : int + Create smaller groups of sequences that can + be processed in parallel. + position : bool + True if the start position for each k-mer should be saved. + False otherwise. + + Returns + ------- + clusters : dict + Dictionary with representative sequence identifiers + as keys and lists of tuples as values. Each tuple + contains a sequence identifier of a clustered + sequence, the proportion of shared kmers with the + representative and the length of the clustered + sequence. + """ + # Sort sequences by length + sorted_seqs = {k: v for k, v in im.sort_iterable(sequences.items(), + sort_key=lambda x: len(x[1]), + reverse=True)} + + if divide is not None: + # Divide input sequences into smaller groups + cluster_inputs = im.split_iterable(sorted_seqs, divide) + else: + cluster_inputs = [sorted_seqs] + + common_args = [word_size, window_size, clustering_sim, + representatives, grow_clusters, kmer_offset, + position, seq_num_cluster, + sc.clusterer] + cluster_inputs = [[c, *common_args] for c in cluster_inputs] + + # Cluster proteins + clustering_results = mo.map_async_parallelizer(cluster_inputs, + mo.function_helper, + cpu_cores, + show_progress=True) + + # Merge clusters + clusters = [d[0] for d in clustering_results] + clusters = im.merge_dictionaries(clusters) + rep_sequences = [d[1] for d in clustering_results] + rep_sequences = im.merge_dictionaries(rep_sequences) + + # Perform clustering with representatives + # This step does not run for AlleleCall as we do + # not need to compare schema representatives + # It runs for CreateSchema because the representatives + # from each cluster group were not compared + # No need to run this step if input sequences were not divided into smaller groups + # AlleleCall does not select new representatives, so this does not run + if len(cluster_inputs) > 1 and len(rep_sequences) > 0: + # Cluster representatives + rep_clusters = sc.clusterer(rep_sequences, word_size, + window_size, clustering_sim, + representatives, grow_clusters, + kmer_offset, position, + seq_num_cluster) + + merged_clusters = {} + for k, v in rep_clusters[0].items(): + # Merge clusters whose representatives are similar + for n in v: + # Representatives from other clusters are added with + # similarity score against new representative + # Clustered sequences from other clusters are added + # with similarity score against their representative + add_seqids = [n] + [s for s in clusters[n[0]] if s[0] != n[0]] + merged_clusters.setdefault(k, []).extend(add_seqids) + + clusters = merged_clusters + + # Sort clusters + clusters = {k: v for k, v in im.sort_iterable(clusters.items())} + + # Write file with clustering results + clusters_out = os.path.join(temp_directory, 'clusters.txt') + sc.write_clusters(clusters, clusters_out) + + return clusters def cluster_representative_filter(clusters, representative_filter, - output_directory): - """Exclude sequences highly similar to cluster representatives. - - After removing highly similar sequences, excludes clusters - that are singletons (only contain the representative). - - Parameters - ---------- - clusters : dict - Dictionary with representative sequence identifiers - as keys and lists of tuples as values. Each tuple - contains a sequence identifier of a clustered - sequence, the proportion of shared kmers with the - representative and the length of the clustered - sequence. - representative_filter : float - Similarity threshold value. 
Sequences with - equal or greater similarity value with the cluster's - representative are excluded from clusters. - output_directory : str - Path to the directory where the clustering results - will be saved to. - - Returns - ------- - A list with the following elements: - pruned_clusters : dict - Clusters without the sequences that were highly - similar to the cluster's representative and without - the clusters that were singletons (only contained - the representative). - excluded_seqids : list - List with the sequence identifiers of the sequences - that were excluded from the clusters. - """ - # remove sequences that are very similar to representatives - pruning_results = sc.representative_pruner(clusters, - representative_filter) - - pruned_clusters, excluded_seqids = pruning_results - - # get identifiers of excluded sequences - # determine set because same seqids could be in several clusters - excluded_seqids = set([e[0] for e in excluded_seqids]) - - # remove excluded seqids from clusters without high representative - # similarity - pruned_clusters = {k: [e for e in v if e[0] not in excluded_seqids] - for k, v in pruned_clusters.items()} - - # write file with pruning results - pruned_out = os.path.join(output_directory, 'clusters.txt') - sc.write_clusters(pruned_clusters, pruned_out) - - # identify singletons and exclude those clusters - singletons = im.select_keys(pruned_clusters, 0) - - pruned_clusters = im.prune_dictionary(pruned_clusters, singletons) - - # determine number of sequences that still need to be evaluated - # +1 to include representative - clustered_sequences = sum([len(v)+1 for k, v in pruned_clusters.items()]) + len(singletons) - - # write list of excluded seqids to file - excluded_outfile = os.path.join(output_directory, 'excluded.txt') - fo.write_lines(excluded_seqids, excluded_outfile) - - return [pruned_clusters, excluded_seqids, singletons, clustered_sequences] + output_directory): + """Exclude sequences highly similar to cluster representatives. + + After removing highly similar sequences, excludes clusters + that are singletons (only contain the representative). + + Parameters + ---------- + clusters : dict + Dictionary with representative sequence identifiers + as keys and lists of tuples as values. Each tuple + contains a sequence identifier of a clustered + sequence, the proportion of shared kmers with the + representative and the length of the clustered + sequence. + representative_filter : float + Similarity threshold value. Sequences with + equal or greater similarity value with the cluster's + representative are excluded from clusters. + output_directory : str + Path to the directory where the clustering results + will be saved to. + + Returns + ------- + A list with the following elements: + pruned_clusters : dict + Clusters without the sequences that were highly + similar to the cluster's representative and without + the clusters that were singletons (only contained + the representative). + excluded_seqids : list + List with the sequence identifiers of the sequences + that were excluded from the clusters. 
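# The tuples described above store the proportion of shared k-mers between a
# clustered sequence and its representative. A bare-bones version of that
# similarity measure (the real clustering selects minimizers from k-mer
# windows, which this sketch omits):
def kmer_set(sequence, k=5, offset=1):
	return {sequence[i:i+k] for i in range(0, len(sequence) - k + 1, offset)}

def shared_kmer_proportion(candidate, representative, k=5):
	candidate_kmers = kmer_set(candidate, k)
	return len(candidate_kmers & kmer_set(representative, k)) / len(candidate_kmers)

# A candidate joins a cluster when this proportion reaches clustering_sim;
# two identical protein strings share every k-mer:
print(shared_kmer_proportion('MKTAYIAKQRQISFVK', 'MKTAYIAKQRQISFVK'))  # 1.0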
+ """ + # remove sequences that are very similar to representatives + pruning_results = sc.representative_pruner(clusters, + representative_filter) + + pruned_clusters, excluded_seqids = pruning_results + + # get identifiers of excluded sequences + # determine set because same seqids could be in several clusters + excluded_seqids = set([e[0] for e in excluded_seqids]) + + # remove excluded seqids from clusters without high representative + # similarity + pruned_clusters = {k: [e for e in v if e[0] not in excluded_seqids] + for k, v in pruned_clusters.items()} + + # write file with pruning results + pruned_out = os.path.join(output_directory, 'clusters.txt') + sc.write_clusters(pruned_clusters, pruned_out) + + # identify singletons and exclude those clusters + singletons = im.select_keys(pruned_clusters, 0) + + pruned_clusters = im.prune_dictionary(pruned_clusters, singletons) + + # determine number of sequences that still need to be evaluated + # +1 to include representative + clustered_sequences = sum([len(v)+1 for k, v in pruned_clusters.items()]) + len(singletons) + + # write list of excluded seqids to file + excluded_outfile = os.path.join(output_directory, 'excluded.txt') + fo.write_lines(excluded_seqids, excluded_outfile) + + return [pruned_clusters, excluded_seqids, singletons, clustered_sequences] def cluster_intra_filter(clusters, sequences, word_size, - intra_filter, output_directory): - """Exclude sequences that are highly similar to other clustered sequences. - - Parameters - ---------- - clusters : dict - Dictionary with representative sequence identifiers - as keys and lists of tuples as values. Each tuple - contains a sequence identifier of a clustered - sequence, the proportion of shared kmers with the - representative and the length of the clustered - sequence. - sequences : dict - Dictionary with sequence identifiers as keys and - sequences as values. - word_size : int - Value k for the k-mer size. - intra_filter : float - Similarity threshold value. Sequences with - equal or greater similarity value with other - clustered sequences are excluded from clusters. - output_directory : str - Path to the directory where the clustering results - will be saved to. - - Returns - ------- - A list with the following elements: - clusters : dict - Clusters without the sequences that were highly - similar to other clustered sequences. - intra_excluded : list - List with the identifiers of the sequences that - were excluded. 
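# Pruning in miniature, following the two rules implemented above: drop
# clustered sequences whose similarity to the representative is at or above
# representative_filter, then set aside clusters left empty (singletons).
# Tuples are (seqid, similarity, length), matching the cluster structure.
def prune(clusters, representative_filter):
	pruned = {rep: [s for s in seqs if s[1] < representative_filter]
			  for rep, seqs in clusters.items()}
	singletons = [rep for rep, seqs in pruned.items() if not seqs]
	pruned = {rep: seqs for rep, seqs in pruned.items() if seqs}
	return pruned, singletons

example = {'rep1': [('seqA', 0.99, 310), ('seqB', 0.72, 295)],
		   'rep2': [('seqC', 0.95, 180)]}
# With a 0.9 filter, seqA and seqC are excluded and rep2 becomes a singleton.
print(prune(example, 0.9))  # ({'rep1': [('seqB', 0.72, 295)]}, ['rep2'])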
- """ - # identify clusters with more than 1 sequence - intra_clusters = {k: v for k, v in clusters.items() if len(v) > 1} - - excluded_seqids, excluded_sims = sc.intra_cluster_sim(intra_clusters, - sequences, - word_size, - intra_filter) - - intra_excluded = [v for k, v in excluded_seqids.items()] - intra_excluded = im.flatten_list(intra_excluded) - # get identifiers of excluded sequences - # determine set because same seqids could be in several clusters - intra_excluded = set(intra_excluded) - - # remove excluded seqids from clusters without high intra-similarity - pruned_clusters = {k: [e for e in v if e[0] not in intra_excluded] - for k, v in clusters.items()} - - # write excluded to file - intrasim_out = os.path.join(output_directory, 'excluded.txt') - sc.write_clusters(excluded_sims, intrasim_out) - # write clusters to file - intrasim_out = os.path.join(output_directory, 'clusters.txt') - sc.write_clusters(pruned_clusters, intrasim_out) - - return [pruned_clusters, intra_excluded] + intra_filter, output_directory): + """Exclude sequences that are highly similar to other clustered sequences. + + Parameters + ---------- + clusters : dict + Dictionary with representative sequence identifiers + as keys and lists of tuples as values. Each tuple + contains a sequence identifier of a clustered + sequence, the proportion of shared kmers with the + representative and the length of the clustered + sequence. + sequences : dict + Dictionary with sequence identifiers as keys and + sequences as values. + word_size : int + Value k for the k-mer size. + intra_filter : float + Similarity threshold value. Sequences with + equal or greater similarity value with other + clustered sequences are excluded from clusters. + output_directory : str + Path to the directory where the clustering results + will be saved to. + + Returns + ------- + A list with the following elements: + clusters : dict + Clusters without the sequences that were highly + similar to other clustered sequences. + intra_excluded : list + List with the identifiers of the sequences that + were excluded. + """ + # identify clusters with more than 1 sequence + intra_clusters = {k: v for k, v in clusters.items() if len(v) > 1} + + excluded_seqids, excluded_sims = sc.intra_cluster_sim(intra_clusters, + sequences, + word_size, + intra_filter) + + intra_excluded = [v for k, v in excluded_seqids.items()] + intra_excluded = im.flatten_list(intra_excluded) + # get identifiers of excluded sequences + # determine set because same seqids could be in several clusters + intra_excluded = set(intra_excluded) + + # remove excluded seqids from clusters without high intra-similarity + pruned_clusters = {k: [e for e in v if e[0] not in intra_excluded] + for k, v in clusters.items()} + + # write excluded to file + intrasim_out = os.path.join(output_directory, 'excluded.txt') + sc.write_clusters(excluded_sims, intrasim_out) + # write clusters to file + intrasim_out = os.path.join(output_directory, 'clusters.txt') + sc.write_clusters(pruned_clusters, intrasim_out) + + return [pruned_clusters, intra_excluded] def blast_clusters(clusters, sequences, output_directory, - blastp_path, makeblastdb_path, cpu_cores, - blastdb_aliastool_path, only_rep=False): - """Use BLAST to align sequences in the same clusters. - - Parameters - ---------- - clusters : dict - Dictionary with representative sequence identifiers - as keys and lists of tuples as values. 
Each tuple - contains a sequence identifier of a clustered - sequence, the proportion of shared kmers with the - representative and the length of the clustered - sequence. - sequences : dict - Path to the FASTA file that contains the sequences in - the clusters. - output_directory : str - Path to the directory where the clustering results - will be saved to. - blastp_path : str - Path to the `BLASTp` executable. - makeblastdb_path : str - Path to the `makeblastdb` executable. - cpu_cores : int - Number of BLASTp processes to run in parallel. + blastp_path, makeblastdb_path, cpu_cores, + blastdb_aliastool_path, only_rep=False): + """Use BLAST to align sequences in the same clusters. + + Parameters + ---------- + clusters : dict + Dictionary with representative sequence identifiers + as keys and lists of tuples as values. Each tuple + contains a sequence identifier of a clustered + sequence, the proportion of shared kmers with the + representative and the length of the clustered + sequence. + sequences : dict + Path to the FASTA file that contains the sequences in + the clusters. + output_directory : str + Path to the directory where the clustering results + will be saved to. + blastp_path : str + Path to the `BLASTp` executable. + makeblastdb_path : str + Path to the `makeblastdb` executable. + cpu_cores : int + Number of BLASTp processes to run in parallel. blastdb_aliastool_path : str - Path to the blastalias_tool executable to convert + Path to the blastalias_tool executable to convert seqid files to binary format. - only_rep - - Returns - ------- - blast_results : list - List with paths to the files with BLASTp results - (one file per cluster). - """ - # Create directory to store BLASTp database - blast_db_dir = fo.join_paths(output_directory, ['BLASTp_db']) - fo.create_directory(blast_db_dir) - # Create BLAST DB - blast_db = fo.join_paths(blast_db_dir, ['distinct_proteins']) - db_std = bw.make_blast_db(makeblastdb_path, sequences, blast_db, 'prot') - - blastp_results_dir = os.path.join(output_directory, 'BLASTp_outfiles') - fo.create_directory(blastp_results_dir) - - # Create TXT files with the list of sequences per cluster - seqids_to_blast = sc.blast_seqids(clusters, blastp_results_dir, only_rep) - - # Distribute clusters per available cores - process_num = 20 if cpu_cores <= 20 else cpu_cores - splitted_seqids = mo.distribute_loci(seqids_to_blast, process_num, 'seqcount') - common_args = [sequences, blastp_results_dir, blastp_path, - blast_db, blastdb_aliastool_path, only_rep, sc.cluster_blaster] - splitted_seqids = [[s, *common_args] for s in splitted_seqids] - - # BLAST sequences in a cluster against every sequence in that cluster - blast_results = mo.map_async_parallelizer(splitted_seqids, - mo.function_helper, - cpu_cores, - show_progress=True) - - return [blast_results, blastp_results_dir] + only_rep + + Returns + ------- + blast_results : list + List with paths to the files with BLASTp results + (one file per cluster). 
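# For the intra-cluster step defined above (cluster_intra_filter), the core
# idea is a pairwise sweep inside each cluster. Whether sc.intra_cluster_sim
# keeps the longer sequence of a too-similar pair is an assumption in this
# sketch; the pairing and threshold logic is the point.
from itertools import combinations

def intra_excluded(cluster_seqs, similarity, intra_filter):
	# cluster_seqs: seqid -> sequence; similarity: callable on two sequences
	excluded = set()
	for a, b in combinations(cluster_seqs, 2):
		if similarity(cluster_seqs[a], cluster_seqs[b]) >= intra_filter:
			# mark the shorter sequence of the pair for exclusion (assumed rule)
			excluded.add(min((a, b), key=lambda s: len(cluster_seqs[s])))
	return excluded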
+ """ + # Create directory to store BLASTp database + blast_db_dir = fo.join_paths(output_directory, ['BLASTp_db']) + fo.create_directory(blast_db_dir) + # Create BLAST DB + blast_db = fo.join_paths(blast_db_dir, ['distinct_proteins']) + db_std = bw.make_blast_db(makeblastdb_path, sequences, blast_db, 'prot') + + blastp_results_dir = os.path.join(output_directory, 'BLASTp_outfiles') + fo.create_directory(blastp_results_dir) + + # Create TXT files with the list of sequences per cluster + seqids_to_blast = sc.blast_seqids(clusters, blastp_results_dir, only_rep) + + # Distribute clusters per available cores + process_num = 20 if cpu_cores <= 20 else cpu_cores + splitted_seqids = mo.distribute_loci(seqids_to_blast, process_num, 'seqcount') + common_args = [sequences, blastp_results_dir, blastp_path, + blast_db, blastdb_aliastool_path, only_rep, sc.cluster_blaster] + splitted_seqids = [[s, *common_args] for s in splitted_seqids] + + # BLAST sequences in a cluster against every sequence in that cluster + blast_results = mo.map_async_parallelizer(splitted_seqids, + mo.function_helper, + cpu_cores, + show_progress=True) + + return [blast_results, blastp_results_dir] def compute_bsr(subject_score, query_score): - """Compute the BLAST Score Ratio for an alignment between two sequences. + """Compute the BLAST Score Ratio for an alignment between two sequences. - Parameters - ---------- - subject_score : float - Alignment raw score computed by BLAST. - query_score : float - Raw score computed by BLAST for the - self-alignment of the query sequence. + Parameters + ---------- + subject_score : float + Alignment raw score computed by BLAST. + query_score : float + Raw score computed by BLAST for the + self-alignment of the query sequence. - Returns - ------- - bsr : float - BLAST Score Ratio for the alignment. - """ - bsr = subject_score / query_score + Returns + ------- + bsr : float + BLAST Score Ratio for the alignment. + """ + bsr = subject_score / query_score - return bsr + return bsr def determine_self_scores(fasta_file, output_directory, makeblastdb_path, - blast_path, db_type, blast_threads, blastdb_aliastool_path): - """Compute the self-alignment raw score for sequences in a FASTA file. - - Parameters - ---------- - fasta_file : str - Path to the FASTA file that contains the sequences. - output_directory : str - Path to the working directory. - makeblastdb_path : str - Path to the 'makeblastdb' executable. - blast_path : str - Path to the BLASTp/n executable. - db_type : str - Type of the database, nucleotide (nuc) or - protein (prot). - blast_threads : int - Number of threads/cores used to run BLAST. + blast_path, db_type, blast_threads, blastdb_aliastool_path): + """Compute the self-alignment raw score for sequences in a FASTA file. + + Parameters + ---------- + fasta_file : str + Path to the FASTA file that contains the sequences. + output_directory : str + Path to the working directory. + makeblastdb_path : str + Path to the 'makeblastdb' executable. + blast_path : str + Path to the BLASTp/n executable. + db_type : str + Type of the database, nucleotide (nuc) or + protein (prot). + blast_threads : int + Number of threads/cores used to run BLAST. blastdb_aliastool_path : str - Path to the blastalias_tool executable to convert + Path to the blastalias_tool executable to convert seqid files to binary format. - Returns - ------- - self_scores : dict - Dictionary with sequence identifiers as keys and - tuples with the sequence length and the raw score - of the self-alignment as values. 
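# BSR in context: the ratio of an alignment's raw score to the query's
# self-alignment raw score. The 0.6 classification threshold below is only
# illustrative (chewBBACA reads the schema's configured BSR), not taken from
# this file.
def compute_bsr_example(subject_score, query_score):
	return subject_score / query_score

self_score = 620.0   # raw score of the query aligned against itself
match_score = 410.0  # raw score of the query aligned to a subject
bsr = compute_bsr_example(match_score, self_score)
print(f'BSR = {bsr:.2f}; same-locus candidate: {bsr >= 0.6}')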
- """ - blast_db_dir = fo.join_paths(output_directory, ['BLASTp_db']) - fo.create_directory(blast_db_dir) - blast_db = fo.join_paths(blast_db_dir, [fo.file_basename(fasta_file, False)]) - # Will not work if file contains duplicates - db_std = bw.make_blast_db(makeblastdb_path, fasta_file, blast_db, db_type) - - # Split Fasta file to BLAST short sequences (<30aa) separately - # only possible to have alleles <30aa with non-default schemas - above_outfile, below_outfile = fao.split_seqlength(fasta_file, - output_directory, - ct.BLAST_TASK_THRESHOLD['blastp']) - - total_seqids = above_outfile[0] + below_outfile[0] - all_seqids = above_outfile[1] + below_outfile[1] - - if above_outfile[0] > 0: - above_outdir = fo.join_paths(output_directory, ['above']) - fo.create_directory(above_outdir) - # Divide FASTA file into groups of 100 sequences to reduce - # execution time for large sequence sets - split_fastas = fao.split_seqcount(above_outfile[2], above_outdir, 100) - - # Create TXT with list of sequence identifiers - seqids_files = [] - for f in split_fastas: - seqids = list(f[1]) - seqids_file = fo.join_paths(above_outdir, [fo.file_basename(f[0], False)]) - fo.write_lines(seqids, seqids_file) - seqids_files.append(seqids_file) + Returns + ------- + self_scores : dict + Dictionary with sequence identifiers as keys and + tuples with the sequence length and the raw score + of the self-alignment as values. + """ + blast_db_dir = fo.join_paths(output_directory, ['BLASTp_db']) + fo.create_directory(blast_db_dir) + blast_db = fo.join_paths(blast_db_dir, [fo.file_basename(fasta_file, False)]) + # Will not work if file contains duplicates + db_std = bw.make_blast_db(makeblastdb_path, fasta_file, blast_db, db_type) + + # Split Fasta file to BLAST short sequences (<30aa) separately + # only possible to have alleles <30aa with non-default schemas + above_outfile, below_outfile = fao.split_seqlength(fasta_file, + output_directory, + ct.BLAST_TASK_THRESHOLD['blastp']) + + total_seqids = above_outfile[0] + below_outfile[0] + all_seqids = above_outfile[1] + below_outfile[1] + + if above_outfile[0] > 0: + above_outdir = fo.join_paths(output_directory, ['above']) + fo.create_directory(above_outdir) + # Divide FASTA file into groups of 100 sequences to reduce + # execution time for large sequence sets + split_fastas = fao.split_seqcount(above_outfile[2], above_outdir, 100) + + # Create TXT with list of sequence identifiers + seqids_files = [] + for f in split_fastas: + seqids = list(f[1]) + seqids_file = fo.join_paths(above_outdir, [fo.file_basename(f[0], False)]) + fo.write_lines(seqids, seqids_file) + seqids_files.append(seqids_file) binary_seqid_files = [] for file in seqids_files: @@ -872,29 +872,29 @@ def determine_self_scores(fasta_file, output_directory, makeblastdb_path, binary_file) binary_seqid_files.append(binary_file) seqids_files = binary_seqid_files - # This should not happen or be very rare, but just in case - else: - split_fastas = [] - - # Create directory to store results from final BLASTp - final_blastp_dir = fo.join_paths(output_directory, ['BLASTp_results']) - fo.create_directory(final_blastp_dir) - blast_outputs = ['{0}/{1}_blast_out.tsv'.format(final_blastp_dir, - fo.file_basename(file[0], False)) - for file in split_fastas] - - # Add common arguments to all sublists - blast_inputs = [[blast_path, blast_db, file[0], - blast_outputs[i], 1, 1, - seqids_files[i], 'blastp', None, - None, bw.run_blast] - for i, file in enumerate(split_fastas)] - - # Add file with short sequences - if 
below_outfile[0] > 0: - seqids = below_outfile[1] - seqids_file = fo.join_paths(output_directory, [fo.file_basename(below_outfile[2], False)]) - fo.write_lines(seqids, seqids_file) + # This should not happen or be very rare, but just in case + else: + split_fastas = [] + + # Create directory to store results from final BLASTp + final_blastp_dir = fo.join_paths(output_directory, ['BLASTp_results']) + fo.create_directory(final_blastp_dir) + blast_outputs = ['{0}/{1}_blast_out.tsv'.format(final_blastp_dir, + fo.file_basename(file[0], False)) + for file in split_fastas] + + # Add common arguments to all sublists + blast_inputs = [[blast_path, blast_db, file[0], + blast_outputs[i], 1, 1, + seqids_files[i], 'blastp', None, + None, bw.run_blast] + for i, file in enumerate(split_fastas)] + + # Add file with short sequences + if below_outfile[0] > 0: + seqids = below_outfile[1] + seqids_file = fo.join_paths(output_directory, [fo.file_basename(below_outfile[2], False)]) + fo.write_lines(seqids, seqids_file) binary_file = f'{seqids_file}.bin' blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path, @@ -902,85 +902,85 @@ def determine_self_scores(fasta_file, output_directory, makeblastdb_path, binary_file) seqids_file = binary_file - below_blastout = '{0}/{1}_blastout.tsv'.format(final_blastp_dir, - fo.file_basename(below_outfile[2], False)) - blast_outputs.append(below_blastout) - blast_inputs.append([blast_path, blast_db, below_outfile[2], - below_blastout, 1, 1, - seqids_file, 'blastp-short', None, - None, bw.run_blast]) - - blast_results = mo.map_async_parallelizer(blast_inputs, - mo.function_helper, - blast_threads, - show_progress=False) - - # Concatenate files with BLASTp results - blast_output = fo.join_paths(final_blastp_dir, ['blastout_concat.tsv']) - blast_output = fo.concatenate_files(blast_outputs, blast_output) - - current_results = fo.read_tabular(blast_output) - # Get raw score and sequence length - # multiply by 3 to get DNA sequence length and add 3 to count stop codon - self_results = [line for line in current_results if line[0] == line[4]] - self_scores = {} - for line in self_results: - # multiply by 3 to get DNA sequence length - # add 3 to count stop codon - dna_length = (int(line[3])*3)+3 - self_scores[line[0]] = (dna_length, float(line[6])) - - # Determine if we got the score for all representatives - if len(self_scores) < total_seqids: - missing = [seqid - for seqid in all_seqids - if seqid not in [line[0] for line in self_results]] - # Index FASTA file - concat_reps_index = fao.index_fasta(fasta_file) - # Get all representatives that do not have a self-score - for seqid in missing: - current_rep = concat_reps_index[seqid] - record = fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [current_rep.id, str(current_rep.seq)]) - rep_file = fo.join_paths(output_directory, [f'{current_rep.id}_solo.fasta']) - fo.write_lines([record], rep_file) - # Create file with representative seqid to only compare against self - id_file = fo.join_paths(output_directory, [f'{current_rep.id}_ids.txt']) - fo.write_lines([current_rep.id], id_file) + below_blastout = '{0}/{1}_blastout.tsv'.format(final_blastp_dir, + fo.file_basename(below_outfile[2], False)) + blast_outputs.append(below_blastout) + blast_inputs.append([blast_path, blast_db, below_outfile[2], + below_blastout, 1, 1, + seqids_file, 'blastp-short', None, + None, bw.run_blast]) + + blast_results = mo.map_async_parallelizer(blast_inputs, + mo.function_helper, + blast_threads, + show_progress=False) + + # Concatenate files with BLASTp 
results
+	blast_output = fo.join_paths(final_blastp_dir, ['blastout_concat.tsv'])
+	blast_output = fo.concatenate_files(blast_outputs, blast_output)
+
+	current_results = fo.read_tabular(blast_output)
+	# Get raw score and sequence length
+	# multiply by 3 to get DNA sequence length and add 3 to count stop codon
+	self_results = [line for line in current_results if line[0] == line[4]]
+	self_scores = {}
+	for line in self_results:
+		# multiply by 3 to get DNA sequence length
+		# add 3 to count stop codon
+		dna_length = (int(line[3])*3)+3
+		self_scores[line[0]] = (dna_length, float(line[6]))
+
+	# Determine if we got the score for all representatives
+	if len(self_scores) < total_seqids:
+		missing = [seqid
+				   for seqid in all_seqids
+				   if seqid not in [line[0] for line in self_results]]
+		# Index FASTA file
+		concat_reps_index = fao.index_fasta(fasta_file)
+		# Get all representatives that do not have a self-score
+		for seqid in missing:
+			current_rep = concat_reps_index[seqid]
+			record = fao.fasta_str_record(ct.FASTA_RECORD_TEMPLATE, [current_rep.id, str(current_rep.seq)])
+			rep_file = fo.join_paths(output_directory, [f'{current_rep.id}_solo.fasta'])
+			fo.write_lines([record], rep_file)
+			# Create file with representative seqid to only compare against self
+			id_file = fo.join_paths(output_directory, [f'{current_rep.id}_ids.txt'])
+			fo.write_lines([current_rep.id], id_file)

			binary_file = f'{id_file}.bin'
			blast_std = bw.run_blastdb_aliastool(blastdb_aliastool_path,
												 id_file,
												 binary_file)
			id_file = binary_file

-            rep_blastout = fo.join_paths(output_directory, [f'{current_rep.id}_blastout.tsv'])
-            # Cannot get self-alignemnt for some sequences if composition-based stats is enabled
-            blast_std = bw.run_blast(blast_path, blast_db, rep_file,
-                                     rep_blastout, 1, 1,
-                                     id_file, 'blastp', None, 0)
-            rep_results = fo.read_tabular(rep_blastout)
-            if len(rep_results) > 0:
-                dna_length = (int(rep_results[0][3])*3)+3
-                self_scores[rep_results[0][0]] = (dna_length, float(rep_results[0][6]))
-            else:
-                print('Could not determine the self-alignment raw '
-                      f'score for {rep_results[0][0]}')
-
-    return self_scores
+			rep_blastout = fo.join_paths(output_directory, [f'{current_rep.id}_blastout.tsv'])
+			# Cannot get self-alignment for some sequences if composition-based stats is enabled
+			blast_std = bw.run_blast(blast_path, blast_db, rep_file,
+									 rep_blastout, 1, 1,
+									 id_file, 'blastp', None, 0)
+			rep_results = fo.read_tabular(rep_blastout)
+			if len(rep_results) > 0:
+				dna_length = (int(rep_results[0][3])*3)+3
+				self_scores[rep_results[0][0]] = (dna_length, float(rep_results[0][6]))
+			else:
+				print('Could not determine the self-alignment raw '
+					  f'score for {seqid}')
+
+	return self_scores


def write_coordinates_file(coordinates_file, output_file):
-    """Write genome CDS coordinates to a TSV file.
-
-    Parameters
-    ----------
-    coordinates_file : str
-        Path to the pickle file that contains data about
-        the CDSs coordinates.
-    output_file : str
-        Path to the output TSV file.
-    """
-    data = fo.pickle_loader(coordinates_file)
-    lines = [coords for h, coords in data[0].items()]
-    lines = im.flatten_list(lines)
-    lines = ['\t'.join(line) for line in lines]
-    fo.write_lines(lines, output_file)
+	"""Write genome CDS coordinates to a TSV file.
+
+	Parameters
+	----------
+	coordinates_file : str
+		Path to the pickle file that contains data about
+		the CDSs coordinates.
+	output_file : str
+		Path to the output TSV file.
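# The self-scores above come through bw.run_blast; the same numbers can be
# reproduced with the BLAST+ binaries directly, assuming they are on PATH.
# The -outfmt column list below is an assumption chosen so that, as in the
# parsing above, a self-hit row carries the alignment length (equal to the
# protein length for a full self-alignment) and the raw score.
import subprocess

def blastp_self_scores(query_fasta, blast_db, out_tsv, threads=1):
	subprocess.run(['blastp', '-query', query_fasta, '-db', blast_db,
					'-out', out_tsv, '-num_threads', str(threads),
					'-outfmt', '6 qseqid sseqid length score'],
				   check=True)
	self_scores = {}
	with open(out_tsv) as handle:
		for line in handle:
			qseqid, sseqid, length, score = line.rstrip('\n').split('\t')
			if qseqid == sseqid:
				# *3 for the DNA length, +3 for the stop codon, as above
				self_scores[qseqid] = (int(length) * 3 + 3, float(score))
	return self_scores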
+ """ + data = fo.pickle_loader(coordinates_file) + lines = [coords for h, coords in data[0].items()] + lines = im.flatten_list(lines) + lines = ['\t'.join(line) for line in lines] + fo.write_lines(lines, output_file)
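# End-to-end shape of write_coordinates_file without the fo/im helpers,
# assuming the pickled payload is a sequence whose first element maps CDS
# hashes to lists of coordinate rows (already strings), as the code above
# implies.
import pickle

def coordinates_to_tsv(coordinates_file, output_file):
	with open(coordinates_file, 'rb') as handle:
		data = pickle.load(handle)
	# Flatten the per-hash row lists into one list of rows
	rows = [row for rows in data[0].values() for row in rows]
	with open(output_file, 'w') as handle:
		handle.write('\n'.join('\t'.join(row) for row in rows) + '\n')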