diff --git a/.gitignore b/.gitignore
index f133658..d1dcd15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .coverage
 coverage.lcov
 bowtie2.*.bt2
+.DS_Store
diff --git a/environment.yml b/environment.yml
index 3388e23..0815b9e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,4 +12,5 @@ dependencies:
   - coverage >= 6 # to ensure lcov option is available
   - pip # Add pip to handle packages not available in conda
   - pip:
-    - pynose # Install pynose via pip, if available
\ No newline at end of file
+    - pynose # Install pynose via pip, if available
+    - biom-format
\ No newline at end of file
diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py
index f69dcf7..3f79bd2 100644
--- a/vxdetector/VXdetector.py
+++ b/vxdetector/VXdetector.py
@@ -4,246 +4,455 @@ import glob
 import os
 import sys
+import warnings
 import pandas as pd
 import vxdetector.Output_counter as Output_counter
 import vxdetector.files_manager as files_manager
 from vxdetector.interact_bowtie2 import mapbowtie2, buildbowtie2
 from vxdetector.interact_bedtools import overlap
+import tempfile
+from multiprocessing import Pool
+import biom
+import gzip
+import random
 
 
-def do_statistic(result):
-    r'''Statistical analysis of given directory
+def biom_to_fastq(biom_file, fastq_output):
+    """
+    Converts a BIOM file into a FASTQ file.
 
-    This function calculates the average (mean) value and
-    the standard deviation of all dataFrame collumns containg
-    numeric data.
-    It also declares which variable region(s) is/are the most
-    probable sequenced region over all fastq files.
+    This function reads a BIOM file, an HDF5- or JSON-based format commonly
+    used in metagenomics to store sequence counts and metadata. It extracts
+    sequence identifiers and sequences, and formats them into a FASTQ format.
+    Each sequence is given a default quality score to make it compatible with
+    downstream bioinformatics tools that require FASTQ files.
 
     Parameters
     ----------
-    result : pandas dataFrame
-        DataFrame with filenames as indey and the obtained information
-        in the columns. Each file is a single row.
+    biom_file : str
+        Path to the input BIOM file containing sequence data
+        and associated metadata.
+    fastq_output : str
+        Path to the output FASTQ file where the converted sequences will
+        be saved. Supports gzip compression
+        if the filename ends with '.gz'.
 
     Returns
     -------
-    result : pandas dataFrame
-        DataFrame is the same as above.
-        The first two lines were added which show the average (mean) and
-        standard deviation of each column containg numeric data.
-        Index of these rows is ['Average', 'Standard deviation']
-        Columns are:
-        ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired',
-        'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
-        'V7', 'V8', 'V9', 'Not aligned to a variable region']
+    None
+        Writes the converted sequences to the specified FASTQ file.
+
+    Notes
+    -----
+    - Ensure that the input BIOM file is properly formatted and
+      includes sequence data.
+    - The FASTQ format generated assumes a fixed quality score,
+      which may need adjustment based on downstream applications.
+    """
+    # Open gzip-compressed or plain output, as promised in the docstring
+    open_func = gzip.open if fastq_output.endswith('.gz') else open
+    with open_func(fastq_output, "wt") as f:
+        table = biom.load_table(biom_file)  # Load the BIOM table
+        sequences = table.ids(axis='observation')  # Extract observation IDs
+        for seq_id in sequences:
+            # Truncate the sequence ID if it's too long for Bowtie2's
+            # requirements.
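+            # NOTE: in Deblur/DADA2-style feature tables the observation
+            # IDs are assumed to be the sequences themselves; that is the
+            # assumption this conversion relies on.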
+            truncated_id = seq_id[:50]
+            # Keep only letters so stray characters cannot corrupt the
+            # FASTQ record
+            sanitized_seq = ''.join(filter(str.isalpha, seq_id))
+            # Write one FASTQ record with a constant dummy quality string
+            f.write(f"@{truncated_id}\n{sanitized_seq}\n+\n"
+                    f"{'I' * len(sanitized_seq)}\n")
+    print(f"BIOM file successfully converted to FASTQ format at: "
+          f"{fastq_output}")
+
+
+def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None):
+    """Read a (gzipped) FASTQ file and randomly (without replacement) select
+    a given number of reads.
+
+    Assumes that each read comes in exactly four lines. To save memory while
+    keeping reasonable speed, we first pass through the file just to count
+    the total number of lines (divided by 4, this gives the number of reads).
+    Then we randomly draw (without replacement) sample_size read indices,
+    multiply them by 4 to target the header lines, and sort the list. Next,
+    we read the file again and keep track of the line number (ln). Whenever
+    the line number matches the current random header line number, that line
+    and the three following are pushed to the sampled_readlines list.
+
+    Parameters
+    ----------
+    file_path : str
+        File path of the (gzipped) FASTQ file containing the reads.
+    sample_size : int
+        Number of random reads taken from the FASTQ file.
+    sampled_indices : list of int, optional
+        Previously selected header line numbers. If given, no new random
+        sample is drawn; this keeps paired-end files in sync.
+    seed : int, optional
+        Random generator seed.
+
+    Returns
+    -------
+    tuple of (list of str, list of int)
+        The sampled read lines and the sorted list of selected header
+        line numbers.
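+
+    Examples
+    --------
+    Illustrative only; the file names are hypothetical. Reusing the
+    returned header line numbers keeps a paired-end file pair in sync:
+
+    >>> reads_r1, idx = sample_fastq('s1_R1.fastq.gz', 1000, seed=42)
+    >>> reads_r2, _ = sample_fastq('s1_R2.fastq.gz', 1000,
+    ...                            sampled_indices=idx)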
+    """
+
+    def _count_generator(reader):
+        b = reader(1024 * 1024)
+        while b:
+            yield b
+            b = reader(1024 * 1024)
+
+    def _get_filehandle(file_path):
+        # open file either plain or as gzip compressed file
+        if file_path.endswith('.gz'):
+            FH = gzip.open(file_path, 'rb')
+        else:
+            FH = open(file_path, 'rb')
+        return FH
+
+    # first pass: count number of lines in the file
+    FH = _get_filehandle(file_path)
+    c_generator = _count_generator(FH.read)
+    number_lines = sum(buffer.count(b'\n') for buffer in c_generator)
+    assert number_lines % 4 == 0, (
+        'file %s does not contain a multiple of 4 lines! %i'
+        % (file_path, number_lines))
+    FH.close()
+
+    # second pass: iterate through the file, adding the next 4 lines to
+    # sampled_readlines whenever the line count is in the random selection
+    sampled_readlines = []  # List to hold sampled reads
+    if sampled_indices is not None:
+        # Reuse an earlier selection (keeps paired-end files in sync)
+        sel_reads = sorted(sampled_indices)
+    else:
+        if seed is not None:
+            random.seed(seed)
+        # Never request more reads than the file contains
+        sample_size = min(sample_size, number_lines // 4)
+        sel_reads = sorted(x * 4 for x in
+                           random.sample(range(number_lines // 4),
+                                         sample_size))
+    FH = _get_filehandle(file_path)
+    ln = 0  # line number iterator
+    j = 0   # iterator through sorted, random header line numbers
+    while ln < number_lines and j < len(sel_reads):
+        line = FH.readline()
+        if ln == sel_reads[j]:
+            # Selected header line: keep it and the three lines that follow
+            sampled_readlines.append(str(line, encoding='utf-8'))
+            for i in range(3):
+                line = FH.readline()
+                sampled_readlines.append(str(line, encoding='utf-8'))
+                ln += 1  # keep the counter in sync with the file position
+            j += 1
+        ln += 1
+    FH.close()
+
+    return sampled_readlines, sel_reads
+
+
+def save_sampled_fastq(sampled_reads, output_path):
+    """
+    Writes sampled sequencing reads into a FASTQ file.
+
+    The function accepts a list of sampled reads in FASTQ format and
+    saves them to the specified output file. If the output file ends
+    with '.gz', the file is automatically compressed using gzip.
+    This function ensures that the sampled reads are stored efficiently
+    for use in subsequent steps like alignment or analysis.
+
+    Parameters
+    ----------
+    sampled_reads : list of str
+        List of reads in FASTQ format, where each read consists of four lines:
+        - Line 1: Sequence identifier (e.g., @SEQ_ID).
+        - Line 2: Raw sequence.
+        - Line 3: Optional '+' separator.
+        - Line 4: Quality scores.
+    output_path : str
+        Path to the output file. If the file ends with '.gz',
+        it will be written in a compressed format.
+
+    Returns
+    -------
+    None
+        Saves the sampled reads to the output file.
+    """
+    open_func = gzip.open if output_path.endswith(
+        '.gz') else open  # Use gzip if output is .gz
+
+    with open_func(output_path, 'wt') as f:  # Write in text mode ('wt')
+        f.writelines(sampled_reads)
 
-    '''
-    average = result.mean(numeric_only=True).to_frame().T
-    # calculates mean value for every numeric column
+
+def do_statistic(result):
+    """
+    Performs statistical analysis on sequencing data.
+
+    This function analyzes the results from processing FASTQ files,
+    calculating metrics such as the mean and standard deviation for
+    numeric columns in the data. It also identifies the most common
+    variable region across sequences, adding these statistics as two
+    leading rows ('Average' and 'Standard deviation') of the DataFrame.
+
+    Parameters
+    ----------
+    result : pandas.DataFrame
+        DataFrame containing processed sequencing data. Expected columns
+        include numeric metrics and categorical data such as variable regions.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Updated DataFrame with additional rows for calculated statistics.
+    """
+    # Calculate mean for numeric columns
+    average = result.mean(numeric_only=True).to_frame().T
+    # Find the most common variable region
     region = (result['Sequenced variable region'].mode().values)
-    # finds the most common value in column 'Sequenced variable region'
+    # Format the result for better readability
     region = ' / '.join(str(r) for r in region)
-    region = region.replace('\'', '')
-    region = region.replace('[', '')
-    region = region.replace(']', '')
-    # formats the mode() output for easy viewing
+    region = region.replace('\'', '').replace(
+        '[', '').replace(']', '')  # Clean up formatting
     average['Sequenced variable region'] = region
     if 'Not properly paired' not in average.columns:
+        # Handle cases where reads are not paired
         average['Not properly paired'] = 'not paired'
-    # adds 'Not properly paired' column if reads are unpaired
+    # Calculate standard deviation for numeric columns
     std_dev = result.std(numeric_only=True).to_frame().T
-    # calculates standard deviation for every numeric column
+    # Combine mean and standard deviation dataframes
     statistic = pd.concat([average, std_dev], axis=0)
-    statistic = statistic[['Number of Reads',
-                           'Unaligned Reads [%]', 'Not properly paired',
-                           'Sequenced variable region', 'V1', 'V2', 'V3',
-                           'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
-                           'Not aligned to a variable region']]
-    # combines the average and std_dev dataframes and sorts the columns
-    # in the correct order
+    # Select relevant columns and reorder them
+    statistic = statistic[
+        ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired',
+         'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
+         'V7', 'V8', 'V9', 'Not aligned to a variable region']]
+    # Add descriptors for the statistics
     statistic['row_descriptor'] = ['Average', 'Standard deviation']
+    # Set the row descriptors as the index
     statistic = statistic.set_index('row_descriptor')
-    # adds a descriptive index
+    # Combine the statistical results with the original data
     result = pd.concat([statistic, result], axis=0)
     return result
 
 
 def do_output(result, new_file, single_file):
-    r'''Writes Output
+    """
+    Writes processing results to a CSV file.
 
-    This function writes the obtained information in a csv format.
+    Converts sequencing results stored in a dictionary into a DataFrame,
+    optionally calculates statistics, and writes the data to a CSV file
+    for downstream analysis or record-keeping. If the results are from a
+    single input file, no additional statistics are calculated.
 
     Parameters
-    ---------
+    ----------
     result : dict
-        Dictionary with filenames as keys and the obtained information
-        as a Dictionary stored in the value.
-    new_file : str or _io.TextIOWrapper
-        Filepath to the output file.
-        If none is given to the program (_io.TextIOWrapper) output
-        will go to STDOUT.
-    single_file : Bool
-        Signifies wether a directory or a single file was given via terminal
-        input.
-
-    '''
+        Dictionary containing processing results, keyed by file name.
+    new_file : str or file-like
+        Path to the output CSV file where results will be saved; if
+        sys.stdout is passed, results are printed to STDOUT instead.
+    single_file : bool
+        If True, skips statistical calculations as the results
+        are from a single input file.
+
+    Returns
+    -------
+    None
+        Saves the results to the specified file.
+    """
+
+    # Silence the pandas FutureWarning raised by to_numeric(errors='ignore')
+    warnings.simplefilter(action='ignore', category=FutureWarning)
+    # Convert the dictionary to a DataFrame and sort by index
     result = pd.DataFrame(result).T.sort_index()
-    # converts dict to dataframe and sorts the dataframe by index
-    # allows for easy navigation within the file
     for column in result:
+        # Convert columns to numeric where possible
         result[column] = pd.to_numeric(result[column], errors='ignore')
-    # converts all cell values to a numeric data type if applicable
     if single_file is False:
+        # If multiple files, calculate statistics
         result = do_statistic(result)
     else:
-        result = result[['Number of Reads',
-                         'Unaligned Reads [%]', 'Not properly paired',
-                         'Sequenced variable region', 'V1', 'V2', 'V3',
-                         'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
-                         'Not aligned to a variable region']]
-    result.to_csv(new_file, index=True)
+        # Select relevant columns for single file processing
+        result = result[
+            ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired',
+             'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
+             'V7', 'V8', 'V9', 'Not aligned to a variable region']]
+    result.to_csv(new_file, index=True)  # Write the results to a CSV file
 
 
-def workflow(file_dir, new_file, write_csv):
-    r'''Worker function
+def process_file(fq_file, path, bowtie2_params, sample_size=None):
+    """
+    Processes an individual FASTQ file through sampling, alignment,
+    and overlap analysis.
 
-    This function is the center piece of this program.
-    It does some preliminary work such as grabbing all fastq files
-    in the given directory.
-    After that it calls other functions which analyse the found
-    fastq-files.
+    This function handles all steps for processing a single FASTQ file.
+    It includes optional random sampling of reads to reduce data size,
+    alignment to a reference genome using Bowtie2, and analysis of
+    overlap regions using Bedtools.
 
     Parameters
     ----------
-    file_dir : str
-        Given filepath. Can be a path to single file which leads
-        to visual output whichcan easily be understood or a
-        path to a directory which produces csv output.
-    new_file : str
-        Filepath to the Output file. If the flag is not set
-        output will be printed via STDOUT.
-    write_csv : Bool
-        Wether or not a csv file should be written in the
-        standard Output folder of this program.
+    fq_file : str
+        Path to the input FASTQ file to process.
+    path : str
+        Path to the Bowtie2 library and index files.
+    bowtie2_params : str
+        Additional parameters for Bowtie2 alignment
+        (e.g., "--very-fast --threads 4").
+    sample_size : int, optional
+        Number of reads to randomly sample.
+        If None, the entire file is processed.
+
+    Returns
+    -------
+    dict or None
+        A dictionary with processing results, keyed by file name.
+        Returns None if an error occurs during alignment.
 
-    '''
-    path = files_manager.get_lib()
-    temp_path = files_manager.tmp_dir(path, temp_path=None)
-    # sets the paths of the programm itself and a temporary folder
+    """
     paired = False
-    result = dict()
-    buildbowtie2(path)
-    # builds bowtie2 index
-    if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] \
-       and os.path.isdir(file_dir):
-        files_manager.tmp_dir(path, temp_path)
-        raise ValueError('There were no FASTQ files '
-                         'in this directory')
-    # checks if given directory contains fastq files
-    if os.path.isfile(file_dir):
-        single_file = True
-        file_name = os.path.basename(file_dir)
-        read2_file = os.path.join(os.path.dirname(file_dir),
-                                  file_name.replace('_R1_', '_R2_'))
-        if '_R1_' in file_name and os.path.exists(read2_file):
-            paired = True
-            # searches for a reverse read file
-        aligned_path, Error = mapbowtie2(file_dir, read2_file,
-                                         path, temp_path, paired)
-        # The Programm bowtie2 is used to align the Reads to a reference
-        # 16S database.
-        if Error is True:
-            files_manager.tmp_dir(path, temp_path)
-            raise ValueError('This file does not look like a fastq file')
-        # raises error if given file is not a fastq file
-        if paired is True and Output_counter.rawincount(f'{temp_path}'
-                                                        'paired.bed') == 0:
-            files_manager.tmp_dir(path, temp_path)
-            raise ValueError('This file has no Reads of the required '
-                             'mapping-quality')
-        overlap(path, temp_path, aligned_path)
-        # look which reads intersect with which variable Region
-        if paired is False and Output_counter.rawincount(f'{temp_path}'
-                                                         'BED.bed') == 0:
-            files_manager.tmp_dir(path, temp_path)
-            raise ValueError('This file has no Reads of the required '
-                             'mapping-quality')
-        file_name = file_name.rsplit('.f', 1)[0]
-        file_name = file_name.replace('_R1_001', '')
-        # reformats filename for easy viewing
-        result[file_name] = Output_counter.create_row(temp_path, paired)
-        # streamlines generated output to a visual terminal output
-    elif os.path.isdir(file_dir):
-        single_file = False
-        for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True):
-            paired = False
-            if '_R2_' in fq_file:
-                continue
+    result = {}
+    try:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Downstream helpers build paths by plain string concatenation
+            # (e.g. f'{temp_path}bowtie2.log'), so keep a trailing separator
+            # to make sure all files stay inside the temporary directory
+            temp_path = os.path.join(temp_dir, '')
             file_name = os.path.basename(fq_file)
-            read2_file = os.path.join(os.path.dirname(fq_file),
-                                      file_name.replace('_R1_', '_R2_'))
+            read2_file = os.path.join(os.path.dirname(
+                fq_file), file_name.replace('_R1_', '_R2_'))
             if '_R1_' in file_name and os.path.exists(read2_file):
-                paired = True
-                # searches for a reverse read file
-            aligned_path, Error = mapbowtie2(fq_file, read2_file,
-                                             path, temp_path, paired)
-            # The Programm bowtie2 is used to align the Reads to a reference
-            # 16S database.
-            if Error is True:
-                continue
+                paired = True  # Paired-end sequencing detected
+
+            if sample_size:  # Perform sampling if required
+                sampled_reads_R1, sampled_indices = sample_fastq(
+                    fq_file, sample_size)
+                temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq')
+                save_sampled_fastq(sampled_reads_R1, temp_fastq_R1)
+                fq_file = temp_fastq_R1
+
+                if paired:  # Sample the reverse reads with the same indices
+                    sampled_reads_R2, _ = sample_fastq(
+                        read2_file, sample_size, sampled_indices)
+                    temp_fastq_R2 = os.path.join(temp_path,
+                                                 'sampled_R2.fastq')
+                    save_sampled_fastq(sampled_reads_R2, temp_fastq_R2)
+                    read2_file = temp_fastq_R2
+
+            # Perform alignment and overlap analysis
+            aligned_path, Error = mapbowtie2(
+                fq_file, read2_file, path, temp_path, paired, bowtie2_params)
+            if Error:
+                return None
             overlap(path, temp_path, aligned_path)
-            # look which reads intersect with which variable Region
-            file_name = file_name.rsplit('.f', 1)[0]
-            file_name = file_name.replace('_R1_001', '')
-            # reformats filename for easy viewing
-            result[file_name] = Output_counter.create_row(temp_path,
-                                                          paired)
-            # streamlines generated output to a visual terminal output
-    files_manager.tmp_dir(path, temp_path)
-    # deletes temporary folder
+            file_name = file_name.rsplit('.f', 1)[0].replace('_R1_001', '')
+            result[file_name] = Output_counter.create_row(temp_path, paired)
+    except Exception as e:
+        # Report the failure on STDERR; fall through and return whatever
+        # was processed so far
+        print(f"Error processing file {fq_file}: {e}", file=sys.stderr)
+    return result
+
+
+def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params):
+    """
+    Executes the full workflow for processing sequencing data.
+
+    The workflow includes directory traversal, conversion of BIOM files to
+    FASTQ format (if needed), and parallelized processing of FASTQ files.
+    Results are aggregated and optionally written to a CSV file.
+
+    Parameters
+    ----------
+    file_dir : str
+        Path to the directory or file containing input sequencing data.
+    new_file : str
+        Path to the CSV file where results will be saved.
+    write_csv : bool
+        If True, writes the results to a CSV file.
+    sample_size : int
+        Number of reads to sample from each FASTQ file.
+        If None, all reads are processed.
+    bowtie2_params : str
+        Additional parameters for Bowtie2 alignment.
+
+    Returns
+    -------
+    None
+        Executes the workflow and outputs results as specified.
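+
+    Notes
+    -----
+    Illustrative call (all paths are hypothetical):
+
+    >>> workflow('run1/', 'results.csv', write_csv=False,
+    ...          sample_size=1000, bowtie2_params='--threads 4')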
+ """ + + path = files_manager.get_lib() + buildbowtie2(path) # Prepare Bowtie2 library + result = {} + single_file = False + + if file_dir.endswith('.biom'): # Handle BIOM file input + temp_fastq = os.path.join( + tempfile.gettempdir(), 'temp_sequences.fastq') + biom_to_fastq(file_dir, temp_fastq) + result = process_file(temp_fastq, path, bowtie2_params, sample_size) + single_file = True + elif os.path.isfile(file_dir): # Process single FASTQ file + single_file = True + result = process_file(file_dir, path, bowtie2_params, sample_size) + elif os.path.isdir(file_dir): # Process directory of FASTQ files + fastq_files = glob.glob(f'{file_dir}/**/*.fastq*', recursive=True) + with Pool() as pool: + results = pool.starmap(process_file, [(fq_file, path, + bowtie2_params, sample_size) + for fq_file in fastq_files + if '_R2_' not in fq_file]) + for res in results: + if res: + result.update(res) do_output(result, new_file, single_file) - # writes ouput eiher to STDOUT or to a file specified - # via the -o option - if write_csv is True: - new_file = (f'{path}Output/' - f'{os.path.basename(os.path.dirname(file_dir))}.csv') - do_output(result, new_file, single_file) - # writes csv file to the standard output folder def main(): - r'''Main function + """ + Parses user input and initiates the workflow. - This function uses argparse to interpret user - input and then calls the workflow function which - does the actual work. + This function uses argparse to handle command-line arguments, passing them + to the workflow function. It provides flexibility for customizing the input + directory, Bowtie2 parameters, and output options. - Iput otions - -o, --output : - User specified output path - -c, --csv : - If set a csv file will be written into - the standard Output folder + Parameters + ---------- + None + + Returns + ------- + None + Executes the script based on user-provided inputs. + + Notes + ----- + - Command-line usage should follow the format specified in + the argparse help message. + - Ensure all required dependencies (Bowtie2, Bedtools) are + installed and accessible. 
+ """ - ''' parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This programm tries to find which variable region of the 16S ' - 'sequence was sequencend')) - parser.add_argument('dir_path', - help=('Directory path of the directory containing ' - 'multiple fastq or fasta files.')) + 'This program tries to find which variable region of the 16S \ + sequence was sequenced')) + parser.add_argument('dir_path', help=( + 'Directory path of the directory containing \ + multiple fastq or fasta files.')) parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, help='User can specify a file format in which the \ output is written in the Output folder.') parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', - help='If set the output will be written in a \ - .csv file in the Output folder') + help='If set the output will be written in a .csv \ + file in the Output folder') + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, + default=1000, + help='Number of reads to sample from each FASTQ file') + parser.add_argument('-b', '--bowtie2_params', dest='bowtie2_params', + default=None, + help='Additional parameters to pass to Bowtie2') + # Parse the user input arguments args = parser.parse_args() - # allows terminal input - workflow(args.dir_path, args.output_file, args.write_csv) + + # Start the workflow with provided arguments + workflow(args.dir_path, args.output_file, args.write_csv, + args.sample_size, args.bowtie2_params) if __name__ == '__main__': - main() + main() # Entry point of the script diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py index 912657a..0bd10a8 100644 --- a/vxdetector/interact_bowtie2.py +++ b/vxdetector/interact_bowtie2.py @@ -2,7 +2,9 @@ import os import shutil +import subprocess +# Check for the installation paths of Bowtie2, Samtools, and Bedtools bowtie2_path = shutil.which('bowtie2') if bowtie2_path is None: bowtie2_path = '$CONDA/bin/bowtie2' @@ -12,109 +14,81 @@ bedtools_path = shutil.which('bedtools') if bedtools_path is None: bedtools_path = '$CONDA/bin/bedtools' -# finds where bowtie2, samtools and bedtools -# are installed def buildbowtie2(path): - r'''Builds bowtie2 index - - This function builds an index for bowtie2 it is the equivalent - of building it manually with the "" command. - - Parameters - ---------- - path : str - The program filepath. The directory where it needs to look - for the reference genome and where the index should be saved. - - ''' - input_ref = f'{path}Indexed_bt2/85_otus.fasta' - # reference genome greengenes is used - output_path = f'{path}Indexed_bt2/bowtie2' - if os.path.exists(f'{output_path}.1.bt2'): - # only builds an index for the alignment if there isn't already one - pass - else: - os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} ' - '> /dev/null') - # if a indexfile is missing a new index is build - - -def mapbowtie2(fasta_file, read2_file, path, temp_path, paired): - r'''Maps reads against index - - This function maps read files against a previously build index. - The bowtie standard output .sam is piped into samtools view and - converted to .bam. In this step all unmapped reads and those with a - lower mapquality than requested are discarded. - If the reads are paired the .bam file is converted to .bed with bedtools - bamtobed that way they are properly paired and can be intersected. - - Parameters - ---------- - fasta_file : str - Path to a .fastq file (or .fastq.gz). 
This file contains the (forward)
-        reads which are mapped against the reference.
-    read2_file : str
-        Path to a .fastq file (or .fastq.gz) containing backwards reads.
-        Is disregarded if paired = False. If not needed can be ''.
-    path : str
-        The program filepath. The directory where it needs to look
-        for the index files.
-    temp_path : str
-        Path to a (temporary) folder where the resulting .log files and output
-        files are saved
-    paired : Bool
-        Dictates wether bowtie2 alignes paired or unpaired reads.
-        If paired is set to False bowtie2 will do an unpaired alignment
-        otherwise it will do a paired one.
-
-    Returns
-    -------
-    aligned_path : str
-        Path to the converted bowtie2 Output file in which either paired or
-        unpaired reads have been mapped against the reference.
-    Error : Bool
-        Wether or not bowtie2 ended with an error.
-        In the context of the whole program Error = True leads to either
-        a raised Exception (if only a single file was given) or to the current
-        file being skipped (if a directory was given).
-
-    '''
-    index_path = f'{path}Indexed_bt2/bowtie2'
-    if os.path.exists(f'{index_path}.1.bt2') is False:
-        raise FileNotFoundError(f'No Index files found under "{index_path}"')
-    # raises an Exception if the Index files cannot be found
-    log_path = f'{temp_path}bowtie2.log'
-    bed_logpath = f'{temp_path}bed.log'
-    # declares various filepaths
+    """Builds Bowtie2 index for the reference genome."""
+    input_ref = f'{path}Indexed_bt2/85_otus.fasta'  # Path to reference genome
+    output_path = f'{path}Indexed_bt2/bowtie2'  # Output path for the index
+    # Check if the index already exists; if not, build it
+    if not os.path.exists(f'{output_path}.1.bt2'):
+        os.system(
+            f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null')
+
+
+def mapbowtie2(
+        fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None):
+    """Maps reads against a previously built Bowtie2 index.
+
+    Parameters
+    ----------
+    fasta_file : str
+        Path to the (forward) read file (.fastq or .fastq.gz).
+    read2_file : str
+        Path to the reverse read file; ignored if paired is False.
+    path : str
+        The program filepath containing the index files.
+    temp_path : str
+        Folder in which log files and output files are written.
+    paired : bool
+        If True a paired-end alignment is run, otherwise an unpaired one.
+    bowtie2_params : str, optional
+        Additional command-line options appended to the Bowtie2 call.
+
+    Returns
+    -------
+    aligned_path : str
+        Path to the .bed file containing the mapped reads.
+    Error : bool
+        True if the Bowtie2 log contains an error message.
+    """
+
+    index_path = f'{path}Indexed_bt2/bowtie2'  # Path to the Bowtie2 index
+    # Check if the index files exist
+    if not os.path.exists(f'{index_path}.1.bt2'):
+        raise FileNotFoundError(f'No index files found under "{index_path}"')
+
+    log_path = f'{temp_path}bowtie2.log'  # Log file for Bowtie2
+    bed_logpath = f'{temp_path}bed.log'  # Log file for Bedtools
     Error = False
-    if paired is True:
-        aligned_path = f'{temp_path}paired.bed'
-        cmd = f'{bowtie2_path} -x {index_path} -1 {fasta_file} \
-            -2 {read2_file} --fast 2> {log_path} | \
-            {samtools_path} view -b -q 30 -S -F 4 \
-            | {bedtools_path} bamtobed -bedpe -i stdin > {aligned_path} \
-            2> {bed_logpath}'
-        # Should a backward read be found both files will be given to bowtie2.
-        # After converting .sam to .bam a conversion to .bed is done to
-        # properly mate the pairs
+
+    # Base command for Bowtie2
+    bowtie2_base_cmd = [bowtie2_path, '-x', index_path, '--fast']
+
+    # Add additional Bowtie2 parameters if provided
+    if bowtie2_params:
+        bowtie2_base_cmd += bowtie2_params.split()
+
+    # Handle paired-end reads
+    if paired:
+        aligned_path = f'{temp_path}paired.bed'  # Output for paired reads
+        bowtie2_base_cmd += ['-1', fasta_file, '-2', read2_file]
+
+        # Commands for Samtools and Bedtools
+        cmd_sam = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
+        cmd_bed = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin']
+
+        # Run the Bowtie2, Samtools, and Bedtools pipeline
+        with open(log_path, 'w') as log_file, \
+                open(bed_logpath, 'w') as bed_log, \
+                open(aligned_path, 'w') as output_file:
+            bowtie_process = subprocess.Popen(
+                bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
+            samtools_process = subprocess.Popen(
+                cmd_sam, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
+            # Close our copies of the pipe handles so SIGPIPE propagates
+            # if a downstream tool exits early
+            bowtie_process.stdout.close()
+            subprocess.run(cmd_bed, stdin=samtools_process.stdout,
+                           stdout=output_file, stderr=bed_log)
+            samtools_process.stdout.close()
+
+    # Handle unpaired reads
     else:
-        aligned_path = f'{temp_path}unpaired.bam'
-        cmd = f'{bowtie2_path} -x {index_path} -q -U {fasta_file} \
-            --fast 2> {log_path} | {samtools_path} view -b -q 30 -S -F 4 \
-            -o {aligned_path}'
-        # Should no backward read be found it will just use the forward
-        # read and does an alignment followed by a pipe to convert
-        # the bowtie2 output .sam to a .bam file
-    os.system(cmd)
+        # Output path for unpaired reads
+        aligned_path = f'{temp_path}unpaired.bed'
+        bowtie2_base_cmd += ['-U', fasta_file]
+
+        # Commands for Samtools and Bedtools
+        cmd_sam = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
+        cmd_bed = [bedtools_path, 'bamtobed', '-i', 'stdin']
+
+        # Run the Bowtie2, Samtools, and Bedtools pipeline
+        with open(log_path, 'w') as log_file, \
+                open(bed_logpath, 'w') as bed_log, \
+                open(aligned_path, 'w') as output_file:
+            bowtie_process = subprocess.Popen(
+                bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
+            samtools_process = subprocess.Popen(
+                cmd_sam, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
+            bowtie_process.stdout.close()  # allow SIGPIPE, see above
+            subprocess.run(cmd_bed, stdin=samtools_process.stdout,
+                           stdout=output_file, stderr=bed_log)
+            samtools_process.stdout.close()
+
+    # Check for errors in the Bowtie2 log file
     with open(log_path, 'r') as log:
-        lines = log.readlines()
-        try:
-            int(lines[0].split()[0])
-        except ValueError:
+        if any("error" in line.lower() for line in log):
             Error = True
-    # Checks if bowtie2 exited with an error
-
+    # Return the path to the aligned file and error status
     return aligned_path, Error
diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index a260e47..2c9c72b 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -151,7 +151,7 @@ def test_singleFile(self):
         expected = f'{self.path}test_data/Output_test.csv'
         actual = f'{self.fp_tmpdir}/singleFile_test.csv'
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, False)
+        vx.workflow(test_file, actual, False, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -166,7 +166,7 @@ def test_directory(self):
         expected = f'{self.path}test_data/dir_test.csv'
         actual = f'{self.path}/test_data/dir_test_actual.csv'
         test_file = f'{self.path}test_data/test_dir/'
-        vx.workflow(test_file, actual, False)
+        vx.workflow(test_file, actual, False, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -181,7 +181,7 @@ def test_c_option(self):
         expected = f'{self.path}test_data/Output_test.csv'
         actual = sys.stdout
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, True)
+        vx.workflow(test_file, actual, True, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -194,25 +194,26 @@ def test_no_fastq(self):
 
     def test_no_fastq(self):
         with self.assertRaises(ValueError) as cm:
-            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout, False)
+            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout,
+                        False, 100, None)
         self.assertEqual('There were no FASTQ files in this directory',
                          str(cm.exception))
 
     def test_Error(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/Output_test.csv',
-                        sys.stdout, False)
+                        sys.stdout, False, 100, None)
         self.assertEqual('This file does not look like a fastq file',
                          str(cm.exception))
 
     def test_raise(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_test.fastq',
-                        sys.stdout, False)
+                        sys.stdout, False, 100, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_'
-                        'paired_R1_001.fastq', sys.stdout, False)
+                        'paired_R1_001.fastq', sys.stdout, False, 100, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))
diff --git a/vxdetector/tests/test_interact_bowtie.py b/vxdetector/tests/test_interact_bowtie.py
index 860d1f0..429ace7 100644
--- a/vxdetector/tests/test_interact_bowtie.py
+++ b/vxdetector/tests/test_interact_bowtie.py
@@ -31,12 +31,14 @@ def test_aligned_path(self):
         paired = False
         temp_path = self.fp_tmpdir
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                                             path, temp_path, paired)
-        self.assertEqual(aligned_path, f'{temp_path}unpaired.bam')
+                                             path, temp_path, paired,
+                                             bowtie2_params="")
+        self.assertEqual(aligned_path, f'{temp_path}unpaired.bed')
         self.assertEqual(Error, False)
         paired = True
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                                             path, temp_path, paired)
+                                             path, temp_path, paired,
+                                             bowtie2_params="")
         self.assertEqual(aligned_path, f'{temp_path}paired.bed')
         self.assertEqual(Error, False)
 
@@ -52,7 +54,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         os.system(f'{samtools_path} view {temp_path}unpaired.bam '
                   f'> {temp_path}unpaired.sam')
         output = []
@@ -66,7 +69,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         output = []
         with open(f'{temp_path}paired.bed')as f:
             for line in f:
@@ -78,12 +82,14 @@ def test_wrong_file_type(self):
         read2_file = f'{path}test_data/paired/BED.bed'
         paired = True
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, read2_file,
-                                             path, self.fp_tmpdir, paired)
+                                             path, self.fp_tmpdir, paired,
+                                             bowtie2_params="")
         self.assertEqual(Error, True)
         paired = False
         fasta_file_local = f'{path}test_data/paired/BED.bed'
         aligned_path, Error = ibo.mapbowtie2(fasta_file_local, read2_file,
-                                             path, self.fp_tmpdir, paired)
+                                             path, self.fp_tmpdir, paired,
+                                             bowtie2_params="")
         self.assertEqual(Error, True)
 
     def test_no_index(self):