From d38ae75264c14d534c95e7df14fd63d713abccb1 Mon Sep 17 00:00:00 2001 From: Jeremy Meiss Date: Mon, 5 Aug 2024 10:35:57 +0200 Subject: [PATCH 01/37] add subsampling --- vxdetector/VXdetector.py | 232 +++++++++++---------------------------- 1 file changed, 64 insertions(+), 168 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index f69dcf7..bee5e52 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -4,246 +4,142 @@ import glob import os import sys +import warnings import pandas as pd -import vxdetector.Output_counter as Output_counter -import vxdetector.files_manager as files_manager -from vxdetector.interact_bowtie2 import mapbowtie2, buildbowtie2 -from vxdetector.interact_bedtools import overlap - +import Output_counter as Output_counter +import files_manager as files_manager +from interact_bowtie2 import mapbowtie2, buildbowtie2 +from interact_bedtools import overlap +import time +from multiprocessing import pool +import random + +def sample_fastq(file_path, sample_size): + '''get random parts from the fq file''' + sampled_reads = [] + with open(file_path, 'r') as f: + lines = f.readlines() + total_reads = len(lines) // 4 + sampled_indices = sorted(random.sample(range(total_reads), sample_size)) + for idx in sampled_indices: + read = lines[idx*4:(idx+1)*4] + sampled_reads.extend(read) + return sampled_reads + +def save_sampled_fastq(sampled_reads, output_path): + '''Save sampled reads to a temporary FASTQ file''' + with open(output_path, 'w') as f: + f.writelines(sampled_reads) def do_statistic(result): - r'''Statistical analysis of given directory - - This function calculates the average (mean) value and - the standard deviation of all dataFrame collumns containg - numeric data. - It also declares which variable region(s) is/are the most - probable sequenced region over all fastq files. - - Parameters - ---------- - result : pandas dataFrame - DataFrame with filenames as indey and the obtained information - in the columns. Each file is a single row. - - Returns - ------- - result : pandas dataFrame - DataFrame is the same as above. - The first two lines were added which show the average (mean) and - standard deviation of each column containg numeric data. 
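A minimal usage sketch of the two helpers this patch introduces; the file names and sample size are placeholders, not taken from the repository. Note that readlines() pulls the whole FASTQ into memory, a limitation the later memory-efficient rewrite addresses.

```python
# Hypothetical usage of the patch-01 helpers; paths are placeholders.
sampled = sample_fastq('example_R1.fastq', sample_size=100)
save_sampled_fastq(sampled, '/tmp/sampled.fastq')
assert len(sampled) == 100 * 4  # four FASTQ lines per sampled read
```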
- Index of these rows is ['Average', 'Standard deviation'] - Columns are: - ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region'] - ''' average = result.mean(numeric_only=True).to_frame().T - # calculates mean value for every numeric column region = (result['Sequenced variable region'].mode().values) - # finds the most common value in column 'Sequenced variable region' region = ' / '.join(str(r) for r in region) region = region.replace('\'', '') region = region.replace('[', '') region = region.replace(']', '') - # formats the mode() output for easy viewing average['Sequenced variable region'] = region if 'Not properly paired' not in average.columns: average['Not properly paired'] = 'not paired' - # adds 'Not properly paired' column if reads are unpaired std_dev = result.std(numeric_only=True).to_frame().T - # calculates standard deviation for every numeric column statistic = pd.concat([average, std_dev], axis=0) - statistic = statistic[['Number of Reads', - 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', - 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', - 'Not aligned to a variable region']] - # combines the average and std_dev dataframes and sorts the columns - # in the correct order + statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] statistic['row_descriptor'] = ['Average', 'Standard deviation'] statistic = statistic.set_index('row_descriptor') - # adds a descriptive index result = pd.concat([statistic, result], axis=0) return result - def do_output(result, new_file, single_file): - r'''Writes Output - - This function writes the obtained information in a csv format. - Parameters - --------- - result : dict - Dictionary with filenames as keys and the obtained information - as a Dictionary stored in the value. - new_file : str or _io.TextIOWrapper - Filepath to the output file. - If none is given to the program (_io.TextIOWrapper) output - will go to STDOUT. - single_file : Bool - Signifies wether a directory or a single file was given via terminal - input. - - ''' + warnings.simplefilter(action='ignore', category=FutureWarning) result = pd.DataFrame(result).T.sort_index() - # converts dict to dataframe and sorts the dataframe by index - # allows for easy navigation within the file for column in result: result[column] = pd.to_numeric(result[column], errors='ignore') - # converts all cell values to a numeric data type if applicable if single_file is False: result = do_statistic(result) else: - result = result[['Number of Reads', - 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', - 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', - 'Not aligned to a variable region']] + result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] result.to_csv(new_file, index=True) - -def workflow(file_dir, new_file, write_csv): - r'''Worker function - - This function is the center piece of this program. - It does some preliminary work such as grabbing all fastq files - in the given directory. - After that it calls other functions which analyse the found - fastq-files. 
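The mode-formatting step in do_statistic() is easiest to see in isolation; a small self-contained sketch with made-up region labels:

```python
import pandas as pd

# Series.mode() returns all most-frequent values; the join collapses
# ties into one display string, mirroring do_statistic() above.
regions = pd.Series(['V3V4', 'V3V4', 'V4'])
label = ' / '.join(str(r) for r in regions.mode().values)
print(label)  # -> V3V4
```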
- - Parameters - ---------- - file_dir : str - Given filepath. Can be a path to single file which leads - to visual output whichcan easily be understood or a - path to a directory which produces csv output. - new_file : str - Filepath to the Output file. If the flag is not set - output will be printed via STDOUT. - write_csv : Bool - Wether or not a csv file should be written in the - standard Output folder of this program. - - ''' +def workflow(file_dir, new_file, write_csv, sample_size): + path = files_manager.get_lib() temp_path = files_manager.tmp_dir(path, temp_path=None) - # sets the paths of the programm itself and a temporary folder paired = False result = dict() buildbowtie2(path) - # builds bowtie2 index - if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] \ - and os.path.isdir(file_dir): + if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] and os.path.isdir(file_dir): files_manager.tmp_dir(path, temp_path) - raise ValueError('There were no FASTQ files ' - 'in this directory') - # checks if given directory contains fastq files + raise ValueError('There were no FASTQ files in this directory') if os.path.isfile(file_dir): single_file = True file_name = os.path.basename(file_dir) - read2_file = os.path.join(os.path.dirname(file_dir), - file_name.replace('_R1_', '_R2_')) + read2_file = os.path.join(os.path.dirname(file_dir), file_name.replace('_R1_', '_R2_')) if '_R1_' in file_name and os.path.exists(read2_file): paired = True - # searches for a reverse read file - aligned_path, Error = mapbowtie2(file_dir, read2_file, - path, temp_path, paired) - # The Programm bowtie2 is used to align the Reads to a reference - # 16S database. + sampled_reads = sample_fastq(file_dir, sample_size) + temp_fastq = os.path.join(temp_path, 'sampled.fastq') + save_sampled_fastq(sampled_reads, temp_fastq) + aligned_path, Error = mapbowtie2(temp_fastq, read2_file, path, temp_path, paired) if Error is True: files_manager.tmp_dir(path, temp_path) raise ValueError('This file does not look like a fastq file') - # raises error if given file is not a fastq file - if paired is True and Output_counter.rawincount(f'{temp_path}' - 'paired.bed') == 0: + if paired is True and Output_counter.rawincount(f'{temp_path}paired.bed') == 0: files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required ' - 'mapping-quality') + raise ValueError('This file has no Reads of the required mapping-quality') overlap(path, temp_path, aligned_path) - # look which reads intersect with which variable Region - if paired is False and Output_counter.rawincount(f'{temp_path}' - 'BED.bed') == 0: + if paired is False and Output_counter.rawincount(f'{temp_path}BED.bed') == 0: files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required ' - 'mapping-quality') + raise ValueError('This file has no Reads of the required mapping-quality') file_name = file_name.rsplit('.f', 1)[0] file_name = file_name.replace('_R1_001', '') - # reformats filename for easy viewing result[file_name] = Output_counter.create_row(temp_path, paired) - # streamlines generated output to a visual terminal output elif os.path.isdir(file_dir): single_file = False + total_files = len(glob.glob(f'{file_dir}**/*.fastq*', recursive=True)) + processed_files = 0 for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True): + processed_files += 1 paired = False if '_R2_' in fq_file: continue file_name = os.path.basename(fq_file) - read2_file = os.path.join(os.path.dirname(fq_file), - 
file_name.replace('_R1_', '_R2_')) + read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) if '_R1_' in file_name and os.path.exists(read2_file): paired = True - # searches for a reverse read file - aligned_path, Error = mapbowtie2(fq_file, read2_file, - path, temp_path, paired) - # The Programm bowtie2 is used to align the Reads to a reference - # 16S database. + sampled_reads = sample_fastq(fq_file, sample_size) + temp_fastq = os.path.join(temp_path, 'sampled.fastq') + save_sampled_fastq(sampled_reads, temp_fastq) + aligned_path, Error = mapbowtie2(temp_fastq, read2_file, path, temp_path, paired) if Error is True: continue overlap(path, temp_path, aligned_path) - # look which reads intersect with which variable Region file_name = file_name.rsplit('.f', 1)[0] file_name = file_name.replace('_R1_001', '') - # reformats filename for easy viewing - result[file_name] = Output_counter.create_row(temp_path, - paired) - # streamlines generated output to a visual terminal output + result[file_name] = Output_counter.create_row(temp_path, paired) files_manager.tmp_dir(path, temp_path) - # deletes temporary folder do_output(result, new_file, single_file) - # writes ouput eiher to STDOUT or to a file specified - # via the -o option if write_csv is True: - new_file = (f'{path}Output/' - f'{os.path.basename(os.path.dirname(file_dir))}.csv') + new_file = (f'{path}Output/{os.path.basename(os.path.dirname(file_dir))}.csv') do_output(result, new_file, single_file) - # writes csv file to the standard output folder - def main(): - r'''Main function - - This function uses argparse to interpret user - input and then calls the workflow function which - does the actual work. - - Iput otions - -o, --output : - User specified output path - -c, --csv : - If set a csv file will be written into - the standard Output folder - - ''' parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This programm tries to find which variable region of the 16S ' - 'sequence was sequencend')) - parser.add_argument('dir_path', - help=('Directory path of the directory containing ' - 'multiple fastq or fasta files.')) - parser.add_argument('-o', '--output', dest='output_file', - default=sys.stdout, - help='User can specify a file format in which the \ - output is written in the Output folder.') - parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', - help='If set the output will be written in a \ - .csv file in the Output folder') + 'This programm tries to find which variable region of the 16S sequence was sequencend')) + parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) + parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, + help='User can specify a file format in which the output is written in the Output folder.') + parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', + help='If set the output will be written in a .csv file in the Output folder') + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, + help='Number of reads to sample from each FASTQ file') args = parser.parse_args() - # allows terminal input - workflow(args.dir_path, args.output_file, args.write_csv) - + workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) if __name__ == '__main__': main() From 4684232aaab4cee66a698740c91458888a49cf27 Mon Sep 17 00:00:00 2001 From: Jeremy Meiss Date: Mon, 5 Aug 2024 10:52:50 +0200 Subject: [PATCH 
02/37] add subsampling --- vxdetector/VXdetector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index bee5e52..22f41f7 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -10,7 +10,6 @@ import files_manager as files_manager from interact_bowtie2 import mapbowtie2, buildbowtie2 from interact_bedtools import overlap -import time from multiprocessing import pool import random From d3e1cafe80b6e119b63a6d9ab97c9eed0c1edbf3 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Mon, 5 Aug 2024 11:01:54 +0200 Subject: [PATCH 03/37] add subsampling --- .DS_Store | Bin 0 -> 6148 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..44be779b37ebb780b7d4c19577818362dacc2ceb GIT binary patch literal 6148 zcmeHKJxc>Y5Ph2u3<*N8vb@etjE(IXBA8PA0m>P_z<5CtQLwn5U}@!#@E3@Py<%%8 z_#`2_4hDM!AeK06jo0!^Aw7}D*kk698JcmeMAvHk5yQB4jz_*O_LwSgKcN`=bj}~ya+uhmMpZx+7+0X^ZtJrDZ{(l<$D8y{6;K8Kl>(;J+HN)ZN`7yh xcsbc?Bl<2~OzJX+wuQlO$M! Date: Mon, 5 Aug 2024 11:06:00 +0200 Subject: [PATCH 04/37] "add subsampling" --- .DS_Store | Bin 6148 -> 6148 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/.DS_Store b/.DS_Store index 44be779b37ebb780b7d4c19577818362dacc2ceb..5adcdaf9c4ac643ccd6532e224019fbae50e472d 100644 GIT binary patch delta 106 zcmZoMXffEJ#uD4Gi-CcGg+Y%YogtHwcs<&ES|Ls(cDw`GO3? V;N<+=0-zoS)*}ZtH?uSf0|3rs95es` delta 106 zcmZoMXffEJ#u8gOgMop8g+Y%YogtH Date: Mon, 5 Aug 2024 11:12:27 +0200 Subject: [PATCH 05/37] add subsampling --- .DS_Store | Bin 6148 -> 6148 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/.DS_Store b/.DS_Store index 5adcdaf9c4ac643ccd6532e224019fbae50e472d..f8561586ee19eeb6420ee78f8731c0fcfb2a330d 100644 GIT binary patch delta 106 zcmZoMXffEJ#u7UxgMop8g+Y%YogtHS16#zF>1ROQSFVG`Sgl delta 106 zcmZoMXffEJ#uD4Gi-CcGg+Y%YogtHwcs<&ES|Ls(cDw`GO3? 
V;N<+=0-zoS)*}ZtH?uSf0|3rs95es` From 8d8438e481d1f785e326fd4681775d9347521919 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Mon, 19 Aug 2024 09:59:03 +0200 Subject: [PATCH 06/37] update --- .DS_Store | Bin 6148 -> 8196 bytes vxdetector/.DS_Store | Bin 0 -> 6148 bytes vxdetector/testlog | 2 ++ vxdetector/tests/.DS_Store | Bin 0 -> 6148 bytes 4 files changed, 2 insertions(+) create mode 100644 vxdetector/.DS_Store create mode 100644 vxdetector/testlog create mode 100644 vxdetector/tests/.DS_Store diff --git a/.DS_Store b/.DS_Store index f8561586ee19eeb6420ee78f8731c0fcfb2a330d..5007d88cbd18a60f323aef667935563dabdf85ad 100644 GIT binary patch literal 8196 zcmeHM&ubGw6n>i~?ZzU8qR@-7An2vkG_955CB_umg9wf2K_zChF$vx5uE{3p4+}Z# zKj6uWS3wXD9>s%RRm79t1pfuagMRZP?PilSJ&C0=F!SEdeDCdh-|Wn0HbkUln${fA zG!f~jOeQAL3@B`z)xHvuJ!c>_*ppqJ%N5<4*N0RciUGxdVn8vV7*GuSFAU(F&5Azc zy)Q;ps~AuW97zV){$QXonU=C6B;PvF@FM_Z6y3Z)PcQ}2kTNZ0M@WpIXu=dym}>Hf zp$T*J8xoh6vLmE0CrutcGjag^p%$2^lMAiJuYE+h3Ef_`-J)cGZ89wT*+0BM zBbuaD*jTXHK(#1OCi#>KS-m)zRgjCw$8S%E`wwz*{>j5GCqtqMax)qoCl`9MC3cmhh4q%GXRr*^Lop6T zXHK3fB09NEbn?d38H|hBIXDEFL5hGb;|3D0AlGe7{LVa?U&a&URt6@ByFp%I*c{I@ GhZz9w!z{W0 diff --git a/vxdetector/.DS_Store b/vxdetector/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3980255f502e9eb1aa9391e620c1023c74d2632a GIT binary patch literal 6148 zcmeHK&x_MQ6n@j~+SIa8afQ7I0k37PUD-vvbnANXBp6u*m6~j74Q4Z?NexRW!z#8|(HY=dz>r!}jY z7Ul^RBchCkxb|tX=4~6S0#9C5c2=k~@p zl5)g}5U;l}%GXMiTlAb}U`$j##CRl1%kga*$FE9+2H31chLKAeuc{~be&0&!ZU6O7 zS3PUvO5BoXkf#&#M1BO5I7+i__ot|B)wgdnoQBhK-gzHo?qz;9O9%e+4cA^u83)U< zADl*`eAwQ7B9qLIlF?KVqHqM4x38ikl=(o;k}y?VPj@&?r#Woz&F8&?qmJ71unJs%1$cdMkr?|13yo^&K&FlWz&e_h zq0B!E%yA9&4Hg>F0~4AG)Kp=P7(&z0?i#FIm z?i7k6pqN4$Q+SxIjJ6F{0jt3OrU1XY9lWs#l_+cPuRM-28KIZAhNtkIU&;_QW<=9+ zWHP=-qbx1Q{r(TpT5DgszV56$8_s+GlbrhFa6Bo8Veyu0ucXYPMI1&)X)+DEx1Y&; z9Hx0vsDw05Fy!6qG>_$UC?|Pbs$5@pI31@Gbhl@-!R~&~9qjGTd+zM<|pXo%*~owA1S3wX&2BLcD@{b zf_gzaG@{Rk*^-LikX={tZjC8LEnLyufv<#&tw*u(Cw8@LFZFUM*I Date: Thu, 12 Sep 2024 16:42:49 +0200 Subject: [PATCH 07/37] updated sample size, now working --- .DS_Store | Bin 8196 -> 8196 bytes vxdetector/.DS_Store | Bin 6148 -> 6148 bytes vxdetector/VXdetector.py | 67 +++++++++++++++++++++++++++++-------- vxdetector/testlog | 11 ++++-- vxdetector/tests/.DS_Store | Bin 6148 -> 6148 bytes 5 files changed, 62 insertions(+), 16 deletions(-) diff --git a/.DS_Store b/.DS_Store index 5007d88cbd18a60f323aef667935563dabdf85ad..6b56441b7de3097435e94e155910f8b2d2e61d93 100644 GIT binary patch delta 939 zcmZp1XmOa}&nUeyU^hRb^kyD`X^fLsi8QjC8R#e&n^;aZ61AWFSWtZOBEgxHLj}co z%7TmXa`N-i85kHCCtC5k^K2CKu+*YrARU=l zRFs&Pp6XN?o0FK7n&Y46lwVSkpBpd057rBGWk69XNFUhT;QE5Z0QrJ)p~!{$8EQpU~f l65m+r2O^4)VyJ!~q6jI5>Oo;b?H+`AG~63<3-cOzA*c?>`s-Squ!=%}8d*2byNUPzAK1 zgrOYO3`SX?zBT{<*Mm&KV@L`^AXnIR8InlYp^6oAYSKm;p;Bgi3> zzX>adpqkFG9xAbr-NZmg!PwYr@;s4{&8x&>88;TFFm7g-_{K6hQ$!DotRf=-TOfqF diff --git a/vxdetector/.DS_Store b/vxdetector/.DS_Store index 3980255f502e9eb1aa9391e620c1023c74d2632a..b731e8a11f420afc3e722ba25df51d9e5f08f722 100644 GIT binary patch delta 117 zcmZoMXfc@J&&atkU^g=(=VTrhHG%*C|LZd_FjxU;FqrJbV#LN63>4~`T*czd3S!-w ze2B#n#A4K%%)x34Vlib+u3%MRV(Fedk99V?nSqXiv5Dnm2evj2rd<$&HWsEcZf58B G%MSobZz$;i delta 163 zcmZoMXfc@J&&ahgU^g=(*JK_RH30?&21g*)`ws>T43nK$jMx~~gM=nmu{g8Jf?0=H z99g@;EDly% + main() + File "/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 141, in main + workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) + File 
"/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 77, in workflow + raise ValueError('There were no FASTQ files in this directory') +ValueError: There were no FASTQ files in this directory diff --git a/vxdetector/tests/.DS_Store b/vxdetector/tests/.DS_Store index b1db8aa7b125485b75cc40b7b9725366f50d3750..00cda90bc2bc6b36c05e292e0508f64d6be0bf8d 100644 GIT binary patch delta 20 bcmZoMXffDum4)5RKu5vY#B%c;77bwlMWO~7 delta 20 bcmZoMXffDum4)5dTt~sk%zX1577bwlMa2dj From 561156990ec5cde2c812a3217845306ecdebb49e Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Fri, 27 Sep 2024 12:56:45 +0200 Subject: [PATCH 08/37] added comments to the skript, removed the debugg- code. made an extra file for the sampler version --- vxdetector/VXdetector_sampler.py | 198 +++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 vxdetector/VXdetector_sampler.py diff --git a/vxdetector/VXdetector_sampler.py b/vxdetector/VXdetector_sampler.py new file mode 100644 index 0000000..2af3313 --- /dev/null +++ b/vxdetector/VXdetector_sampler.py @@ -0,0 +1,198 @@ +#!/usr/bin/python + +import argparse +import glob +import os +import sys +import warnings +import pandas as pd +import Output_counter as Output_counter +import files_manager as files_manager +from interact_bowtie2 import mapbowtie2, buildbowtie2 +from interact_bedtools import overlap +import random + +def sample_fastq(file_path, sample_size, sampled_indices=None): + '''Get random parts from the FASTQ file based on shared indices for paired-end reads.''' + sampled_reads = [] # List to hold sampled reads + with open(file_path, 'r') as f: + lines = f.readlines() + total_reads = len(lines) // 4 # FASTQ files store 4 lines per read + if sampled_indices is None: + # If no indices are provided, sample randomly from the total reads + sampled_indices = sorted(random.sample(range(total_reads), sample_size)) + for idx in sampled_indices: + # Get the read (4 lines per read) and append to the sampled_reads list + read = lines[idx*4:(idx+1)*4] + sampled_reads.extend(read) + return sampled_reads, sampled_indices + +def save_sampled_fastq(sampled_reads, output_path): + '''Save sampled reads to a temporary FASTQ file.''' + with open(output_path, 'w') as f: + f.writelines(sampled_reads) + +def do_statistic(result): + '''Compute statistics (mean and standard deviation) from alignment results.''' + average = result.mean(numeric_only=True).to_frame().T # Compute the mean of numeric columns + region = (result['Sequenced variable region'].mode().values) # Get most frequent region + region = ' / '.join(str(r) for r in region) # Join the modes into a single string + region = region.replace('\'', '').replace('[', '').replace(']', '') # Clean up string format + average['Sequenced variable region'] = region # Add region info to the average dataframe + if 'Not properly paired' not in average.columns: + average['Not properly paired'] = 'not paired' + std_dev = result.std(numeric_only=True).to_frame().T # Compute standard deviation of numeric columns + statistic = pd.concat([average, std_dev], axis=0) # Combine mean and standard deviation + # Select specific columns for the final output + statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] + statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Add row descriptors + statistic = statistic.set_index('row_descriptor') # Set 
descriptor as index + result = pd.concat([statistic, result], axis=0) # Append stats to original results + return result + +def do_output(result, new_file, single_file): + '''Process and output results to a CSV file.''' + warnings.simplefilter(action='ignore', category=FutureWarning) + result = pd.DataFrame(result).T.sort_index() # Convert result to DataFrame and sort + + # Convert columns to numeric where applicable + for column in result: + result[column] = pd.to_numeric(result[column], errors='ignore') + + if single_file is False: + result = do_statistic(result) # Add statistics if processing multiple files + else: + result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] + + # Write the result DataFrame to a CSV file + result.to_csv(new_file, index=True) + +def workflow(file_dir, new_file, write_csv, sample_size): + '''Main workflow for processing FASTQ files and performing sequence alignment.''' + path = files_manager.get_lib() # Get the main working directory + temp_path = files_manager.tmp_dir(path, temp_path=None) # Create a temporary directory + paired = False # To check if the input is paired-end + single_file = False # To check if processing a single file + result = dict() # Dictionary to store results from different files + buildbowtie2(path) # Build the Bowtie2 index + + # Check if the input path exists + if not os.path.exists(file_dir): + raise ValueError(f"The provided path {file_dir} does not exist.") + + # Check if there are FASTQ files in the directory + if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] and os.path.isdir(file_dir): + files_manager.tmp_dir(path, temp_path) + raise ValueError('There were no FASTQ files in this directory') + + # Process single file + if os.path.isfile(file_dir): + single_file = True + file_name = os.path.basename(file_dir) + read2_file = os.path.join(os.path.dirname(file_dir), file_name.replace('_R1_', '_R2_')) # Get the paired file if exists + if '_R1_' in file_name and os.path.exists(read2_file): + paired = True # Mark as paired if both R1 and R2 files exist + + # Sample reads from R1 and R2 with same indices + sampled_reads_R1, sampled_indices = sample_fastq(file_dir, sample_size) + sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) + + # Save sampled reads to temporary files + temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') + temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') + save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) + save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) + + # Run Bowtie2 alignment + aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) + + if Error is True: + files_manager.tmp_dir(path, temp_path) + raise ValueError('This file does not look like a fastq file') + if paired is True and Output_counter.rawincount(f'{temp_path}paired.bed') == 0: + files_manager.tmp_dir(path, temp_path) + raise ValueError('This file has no Reads of the required mapping-quality') + + # Perform overlap analysis using Bedtools + overlap(path, temp_path, aligned_path) + if paired is False and Output_counter.rawincount(f'{temp_path}BED.bed') == 0: + files_manager.tmp_dir(path, temp_path) + raise ValueError('This file has no Reads of the required mapping-quality') + + # Generate results from output + file_name = file_name.rsplit('.f', 1)[0] + file_name = file_name.replace('_R1_001', '') + 
result[file_name] = Output_counter.create_row(temp_path, paired) + + # Process directory of files + elif os.path.isdir(file_dir): + single_file = False + total_files = len(glob.glob(f'{file_dir}**/*.fastq*', recursive=True)) # Count number of FASTQ files + processed_files = 0 + for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True): + processed_files += 1 + paired = False + if '_R2_' in fq_file: + continue # Skip R2 files to avoid processing duplicates + + file_name = os.path.basename(fq_file) + read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) + if '_R1_' in file_name and os.path.exists(read2_file): + paired = True # Mark as paired if both R1 and R2 files exist + + # Sample reads from R1 and R2 with same indices + sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) + sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) + + # Save sampled reads to temporary files + temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') + temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') + save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) + save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) + + # Run Bowtie2 alignment + aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) + if Error is True: + continue # Skip if there's an error in processing the file + + # Perform overlap analysis using Bedtools + overlap(path, temp_path, aligned_path) + + # Generate results from output + file_name = file_name.rsplit('.f', 1)[0] + file_name = file_name.replace('_R1_001', '') + result[file_name] = Output_counter.create_row(temp_path, paired) + + # Clean up temporary directory + files_manager.tmp_dir(path, temp_path) + + # Output the result as CSV or display + do_output(result, new_file, single_file) + if write_csv is True: + new_file = (f'{path}Output/{os.path.basename(os.path.dirname(file_dir))}.csv') + do_output(result, new_file, single_file) + +def main(): + '''Main function to parse user arguments and initiate the workflow.''' + parser = argparse.ArgumentParser(prog='VX detector', description=( + 'This program tries to find which variable region of the 16S sequence was sequenced')) + parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) + parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, + help='User can specify a file format in which the output is written in the Output folder.') + parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', + help='If set the output will be written in a .csv file in the Output folder') + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, + help='Number of reads to sample from each FASTQ file') + + # Parse the user input arguments + args = parser.parse_args() + + # Start the workflow with provided arguments + workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) + +if __name__ == '__main__': + main() # Entry point of the script From b851734b67bf5f6b34831600ffd4519259663d5a Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 21 Nov 2024 14:28:07 +0100 Subject: [PATCH 09/37] Merged Multiprocessing Aspect with Subsampling for a complete Tool --- Output/.DS_Store | Bin 0 -> 6148 bytes vxdetector/VXdetector.py | 275 +++++++++++++++++-------------- vxdetector/VXdetector_sampler.py | 198 ---------------------- 3 files changed, 151 insertions(+), 322 deletions(-) create mode 100644
Output/.DS_Store delete mode 100644 vxdetector/VXdetector_sampler.py diff --git a/Output/.DS_Store b/Output/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 total_reads: + sample_size = total_reads + + if sampled_indices is None: + sampled_indices = sorted(random.sample(range(total_reads), sample_size)) + + with open_func(file_path, 'rt') as f: + read_idx = 0 + read_buffer = [] + for line in f: + read_buffer.append(line) + if len(read_buffer) == 4: # Once we have a full read + if read_idx in sampled_indices: + sampled_reads.extend(read_buffer) # Add the read if it matches the sampled index + read_buffer = [] # Clear the buffer for the next read + read_idx += 1 # Move to the next read + + # Stop if we've collected enough reads + if len(sampled_reads) >= sample_size * 4: + break + return sampled_reads, sampled_indices + def save_sampled_fastq(sampled_reads, output_path): - '''Save sampled reads to a temporary FASTQ file''' - with open(output_path, 'w') as f: + '''Save sampled reads to a temporary FASTQ file.''' + open_func = gzip.open if output_path.endswith('.gz') else open # Use gzip if output is .gz + + with open_func(output_path, 'wt') as f: # Write in text mode ('wt') f.writelines(sampled_reads) -def do_statistic(result): - average = result.mean(numeric_only=True).to_frame().T - region = (result['Sequenced variable region'].mode().values) - region = ' / '.join(str(r) for r in region) - region = region.replace('\'', '') - region = region.replace('[', '') - region = region.replace(']', '') +def do_statistic(result): + '''Performs statistical analysis on the DataFrame + + Calculates the mean and standard deviation for all numeric columns + in the DataFrame, and determines the most common sequenced variable + region. Adds these statistics to the DataFrame and returns the updated + DataFrame. 
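The gzip-or-plain open pattern that runs through the rewritten sampling code, shown in isolation; the path below is a placeholder:

```python
import gzip

path = 'reads.fastq.gz'  # placeholder; a plain .fastq works the same way
open_func = gzip.open if path.endswith('.gz') else open
with open_func(path, 'rt') as handle:  # 'rt' gives text mode for gzip too
    total_reads = sum(1 for _ in handle) // 4  # FASTQ: four lines per read
```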
+ ''' + average = result.mean(numeric_only=True).to_frame().T # Calculate mean for numeric columns + region = (result['Sequenced variable region'].mode().values) # Find the most common variable region + region = ' / '.join(str(r) for r in region) # Format the result for better readability + region = region.replace('\'', '').replace('[', '').replace(']', '') # Clean up formatting average['Sequenced variable region'] = region if 'Not properly paired' not in average.columns: - average['Not properly paired'] = 'not paired' - std_dev = result.std(numeric_only=True).to_frame().T - statistic = pd.concat([average, std_dev], axis=0) + average['Not properly paired'] = 'not paired' # Handle cases where reads are not paired + std_dev = result.std(numeric_only=True).to_frame().T # Calculate standard deviation for numeric columns + statistic = pd.concat([average, std_dev], axis=0) # Combine mean and standard deviation dataframes + # Select relevant columns and reorder them statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - statistic['row_descriptor'] = ['Average', 'Standard deviation'] - statistic = statistic.set_index('row_descriptor') - result = pd.concat([statistic, result], axis=0) + statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Add descriptors for the statistics + statistic = statistic.set_index('row_descriptor') # Set the row descriptors as the index + result = pd.concat([statistic, result], axis=0) # Combine the statistical results with the original data return result + def do_output(result, new_file, single_file): - - warnings.simplefilter(action='ignore', category=FutureWarning) - result = pd.DataFrame(result).T.sort_index() - - print("Available columns in result:", result.columns) - print(result.head()) # Zeigt die ersten Zeilen des DataFrames an + '''Writes the results into a CSV file - + Converts the dictionary of results into a DataFrame, calculates statistics + if working with multiple files, and writes the data into a CSV file. 
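do_output() silences a FutureWarning because pd.to_numeric(..., errors='ignore') is deprecated in recent pandas. An equivalent guarded conversion, as a sketch rather than part of the patch:

```python
import pandas as pd

def to_numeric_if_possible(column):
    # Convert a column to numeric, leaving non-numeric columns untouched.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column
```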
+ ''' + warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore warnings about future behavior + result = pd.DataFrame(result).T.sort_index() # Convert the dictionary to a DataFrame and sort by index for column in result: - result[column] = pd.to_numeric(result[column], errors='ignore') + result[column] = pd.to_numeric(result[column], errors='ignore') # Convert columns to numeric where possible if single_file is False: - result = do_statistic(result) + result = do_statistic(result) # If multiple files, calculate statistics else: + # Select relevant columns for single file processing result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - result.to_csv(new_file, index=True) + result.to_csv(new_file, index=True) # Write the results to a CSV file -def workflow(file_dir, new_file, write_csv, sample_size): - path = files_manager.get_lib() - temp_path = files_manager.tmp_dir(path, temp_path=None) - paired = False - single_file = False - result = dict() - buildbowtie2(path) - if not os.path.exists(file_dir): - raise ValueError(f"The provided path {file_dir} does not exist.") - - if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] and os.path.isdir(file_dir): - files_manager.tmp_dir(path, temp_path) - raise ValueError('There were no FASTQ files in this directory') - if os.path.isfile(file_dir): - single_file = True - file_name = os.path.basename(file_dir) - read2_file = os.path.join(os.path.dirname(file_dir), file_name.replace('_R1_', '_R2_')) - if '_R1_' in file_name and os.path.exists(read2_file): - paired = True - # Sample reads from the forward file (R1) and use the same indices for the reverse file (R2) - sampled_reads_R1, sampled_indices = sample_fastq(file_dir, sample_size) - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) - - # Save sampled reads for both R1 and R2 - temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') - temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') - save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) - save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) - - aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) - - print(f"Mapping result: {aligned_path}, Error: {Error}") - - if Error is True: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file does not look like a fastq file') - if paired is True and Output_counter.rawincount(f'{temp_path}paired.bed') == 0: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required mapping-quality') - overlap(path, temp_path, aligned_path) - if paired is False and Output_counter.rawincount(f'{temp_path}BED.bed') == 0: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required mapping-quality') - file_name = file_name.rsplit('.f', 1)[0] - file_name = file_name.replace('_R1_001', '') - result[file_name] = Output_counter.create_row(temp_path, paired) - - print(f"Output for {file_name}: {result[file_name]}") - - elif os.path.isdir(file_dir): - single_file = False - total_files = len(glob.glob(f'{file_dir}**/*.fastq*', recursive=True)) - processed_files = 0 - for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True): - processed_files += 1 - paired = False - if '_R2_' in fq_file: - continue +def process_file(fq_file, path, bowtie2_params, sample_size=None): + """ + Processes a single 
FASTQ file, including optional read sampling, alignment with Bowtie2, + and overlap analysis using Bedtools. + """ + paired = False + result = {} + try: + with tempfile.TemporaryDirectory() as temp_path: file_name = os.path.basename(fq_file) read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) if '_R1_' in file_name and os.path.exists(read2_file): - paired = True - - # Sample reads from R1 and use the same indices for R2 - sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) - - temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') - temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') - save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) - save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) - - aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) - if Error is True: - continue + paired = True # Paired-end sequencing detected + + if sample_size: # Perform sampling if required + sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) + temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') + save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) + fq_file = temp_fastq_R1 + + if paired: # Process the reverse reads for paired-end data + sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) + temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') + save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) + read2_file = temp_fastq_R2 + + # Perform alignment and overlap analysis + aligned_path, Error = mapbowtie2(fq_file, read2_file, path, temp_path, paired, bowtie2_params) + if Error: + return None overlap(path, temp_path, aligned_path) - - print(f"Overlap completed for {aligned_path}") - - file_name = file_name.rsplit('.f', 1)[0] - file_name = file_name.replace('_R1_001', '') + file_name = file_name.rsplit('.f', 1)[0].replace('_R1_001', '') result[file_name] = Output_counter.create_row(temp_path, paired) - - print(f"Output for {file_name}: {result[file_name]}") + except Exception as e: + print(f"Error processing file {fq_file}: {e}") + return result - print("Result content before do_output:", result) - - files_manager.tmp_dir(path, temp_path) + +def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): + """ + Manages the overall processing workflow, including directory traversal, BIOM file conversion, + and multiprocessing for FASTQ file processing. 
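The fan-out used by workflow() has this shape; the worker below is a stand-in for process_file(), and the job list is invented for illustration:

```python
from multiprocessing import Pool

def toy_worker(name, factor):
    # Stand-in for process_file(fq_file, path, bowtie2_params, sample_size).
    return {name: len(name) * factor}

if __name__ == '__main__':
    jobs = [('sampleA', 2), ('sampleB', 2)]
    with Pool() as pool:
        parts = pool.starmap(toy_worker, jobs)
    merged = {}
    for part in parts:
        if part:  # workers may return None on error
            merged.update(part)
```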
+ """ + path = files_manager.get_lib() + buildbowtie2(path) # Prepare Bowtie2 library + result = {} + single_file = False + + if file_dir.endswith('.biom'): # Handle BIOM file input + temp_fastq = os.path.join(tempfile.gettempdir(), 'temp_sequences.fastq') + biom_to_fastq(file_dir, temp_fastq) + result = process_file(temp_fastq, path, bowtie2_params, sample_size) + single_file = True + elif os.path.isfile(file_dir): # Process single FASTQ file + single_file = True + result = process_file(file_dir, path, bowtie2_params, sample_size) + elif os.path.isdir(file_dir): # Process directory of FASTQ files + fastq_files = glob.glob(f'{file_dir}/**/*.fastq*', recursive=True) + with Pool() as pool: + results = pool.starmap(process_file, [(fq_file, path, bowtie2_params, sample_size) + for fq_file in fastq_files if '_R2_' not in fq_file]) + for res in results: + if res: + result.update(res) do_output(result, new_file, single_file) - if write_csv is True: - new_file = (f'{path}Output/{os.path.basename(os.path.dirname(file_dir))}.csv') - do_output(result, new_file, single_file) - + def main(): + '''Main function to parse user arguments and initiate the workflow.''' parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This programm tries to find which variable region of the 16S sequence was sequencend')) + 'This program tries to find which variable region of the 16S sequence was sequenced')) parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, help='User can specify a file format in which the output is written in the Output folder.') @@ -176,8 +198,13 @@ def main(): help='If set the output will be written in a .csv file in the Output folder') parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, help='Number of reads to sample from each FASTQ file') + parser.add_argument('-b','--bowtie2_params', dest='bowtie2_params', default= None, + help='Additional parameters to pass to Bowtie2') + # Parse the user input arguments args = parser.parse_args() - workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) + + # Start the workflow with provided arguments + workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size, args.bowtie2_params) if __name__ == '__main__': - main() + main() # Entry point of the script \ No newline at end of file diff --git a/vxdetector/VXdetector_sampler.py b/vxdetector/VXdetector_sampler.py deleted file mode 100644 index 2af3313..0000000 --- a/vxdetector/VXdetector_sampler.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/python - -import argparse -import glob -import os -import sys -import warnings -import pandas as pd -import Output_counter as Output_counter -import files_manager as files_manager -from interact_bowtie2 import mapbowtie2, buildbowtie2 -from interact_bedtools import overlap -import random - -def sample_fastq(file_path, sample_size, sampled_indices=None): - '''Get random parts from the FASTQ file based on shared indices for paired-end reads.''' - sampled_reads = [] # List to hold sampled reads - with open(file_path, 'r') as f: - lines = f.readlines() - total_reads = len(lines) // 4 # FASTQ files store 4 lines per read - if sampled_indices is None: - # If no indices are provided, sample randomly from the total reads - sampled_indices = sorted(random.sample(range(total_reads), sample_size)) - for idx in sampled_indices: - # Get the read (4 lines per 
read) and append to the sampled_reads list - read = lines[idx*4:(idx+1)*4] - sampled_reads.extend(read) - return sampled_reads, sampled_indices - -def save_sampled_fastq(sampled_reads, output_path): - '''Save sampled reads to a temporary FASTQ file.''' - with open(output_path, 'w') as f: - f.writelines(sampled_reads) - -def do_statistic(result): - '''Compute statistics (mean and standard deviation) from alignment results.''' - average = result.mean(numeric_only=True).to_frame().T # Compute the mean of numeric columns - region = (result['Sequenced variable region'].mode().values) # Get most frequent region - region = ' / '.join(str(r) for r in region) # Join the modes into a single string - region = region.replace('\'', '').replace('[', '').replace(']', '') # Clean up string format - average['Sequenced variable region'] = region # Add region info to the average dataframe - if 'Not properly paired' not in average.columns: - average['Not properly paired'] = 'not paired' - std_dev = result.std(numeric_only=True).to_frame().T # Compute standard deviation of numeric columns - statistic = pd.concat([average, std_dev], axis=0) # Combine mean and standard deviation - # Select specific columns for the final output - statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Add row descriptors - statistic = statistic.set_index('row_descriptor') # Set descriptor as index - result = pd.concat([statistic, result], axis=0) # Append stats to original results - return result - -def do_output(result, new_file, single_file): - '''Process and output results to a CSV file.''' - warnings.simplefilter(action='ignore', category=FutureWarning) - result = pd.DataFrame(result).T.sort_index() # Convert result to DataFrame and sort - - # Convert columns to numeric where applicable - for column in result: - result[column] = pd.to_numeric(result[column], errors='ignore') - - if single_file is False: - result = do_statistic(result) # Add statistics if processing multiple files - else: - result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - - # Write the result DataFrame to a CSV file - result.to_csv(new_file, index=True) - -def workflow(file_dir, new_file, write_csv, sample_size): - '''Main workflow for processing FASTQ files and performing sequence alignment.''' - path = files_manager.get_lib() # Get the main working directory - temp_path = files_manager.tmp_dir(path, temp_path=None) # Create a temporary directory - paired = False # To check if the input is paired-end - single_file = False # To check if processing a single file - result = dict() # Dictionary to store results from different files - buildbowtie2(path) # Build the Bowtie2 index - - # Check if the input path exists - if not os.path.exists(file_dir): - raise ValueError(f"The provided path {file_dir} does not exist.") - - # Check if there are FASTQ files in the directory - if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] and os.path.isdir(file_dir): - files_manager.tmp_dir(path, temp_path) - raise ValueError('There were no FASTQ files in this directory') - - # Process single file - if os.path.isfile(file_dir): - single_file = True - file_name = os.path.basename(file_dir) - 
read2_file = os.path.join(os.path.dirname(file_dir), file_name.replace('_R1_', '_R2_')) # Get the paired file if exists - if '_R1_' in file_name and os.path.exists(read2_file): - paired = True # Mark as paired if both R1 and R2 files exist - - # Sample reads from R1 and R2 with same indices - sampled_reads_R1, sampled_indices = sample_fastq(file_dir, sample_size) - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) - - # Save sampled reads to temporary files - temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') - temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') - save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) - save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) - - # Run Bowtie2 alignment - aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) - - if Error is True: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file does not look like a fastq file') - if paired is True and Output_counter.rawincount(f'{temp_path}paired.bed') == 0: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required mapping-quality') - - # Perform overlap analysis using Bedtools - overlap(path, temp_path, aligned_path) - if paired is False and Output_counter.rawincount(f'{temp_path}BED.bed') == 0: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required mapping-quality') - - # Generate results from output - file_name = file_name.rsplit('.f', 1)[0] - file_name = file_name.replace('_R1_001', '') - result[file_name] = Output_counter.create_row(temp_path, paired) - - # Process directory of files - elif os.path.isdir(file_dir): - single_file = False - total_files = len(glob.glob(f'{file_dir}**/*.fastq*', recursive=True)) # Count number of FASTQ files - processed_files = 0 - for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True): - processed_files += 1 - paired = False - if '_R2_' in fq_file: - continue # Skip R2 files to avoid processing duplicates - - file_name = os.path.basename(fq_file) - read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) - if '_R1_' in file_name and os.path.exists(read2_file): - paired = True # Mark as paired if both R1 and R2 files exist - - # Sample reads from R1 and R2 with same indices - sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) - - # Save sampled reads to temporary files - temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') - temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') - save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) - save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) - - # Run Bowtie2 alignment - aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) - if Error is True: - continue # Skip if there's an error in processing the file - - # Perform overlap analysis using Bedtools - overlap(path, temp_path, aligned_path) - - # Generate results from output - file_name = file_name.rsplit('.f', 1)[0] - file_name = file_name.replace('_R1_001', '') - result[file_name] = Output_counter.create_row(temp_path, paired) - - # Clean up temporary directory - files_manager.tmp_dir(path, temp_path) - - # Output the result as CSV or display - do_output(result, new_file, single_file) - if write_csv is True: - new_file = (f'{path}Output/{os.path.basename(os.path.dirname(file_dir))}.csv') - do_output(result, new_file, 
single_file) - -def main(): - '''Main function to parse user arguments and initiate the workflow.''' - parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This program tries to find which variable region of the 16S sequence was sequenced')) - parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) - parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, - help='User can specify a file format in which the output is written in the Output folder.') - parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', - help='If set the output will be written in a .csv file in the Output folder') - parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, - help='Number of reads to sample from each FASTQ file') - - # Parse the user input arguments - args = parser.parse_args() - - # Start the workflow with provided arguments - workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) - -if __name__ == '__main__': - main() # Entry point of the script From 765987bf8ef627825ef6515804daf913aa550900 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Tue, 26 Nov 2024 15:31:38 +0100 Subject: [PATCH 10/37] Added docstrings to the missing methods. Added the sample_fastq method from the memory_efficient_sampling branch. Sampled indices are no longer used in sample_fastq, yet still needed for the workflow to function. --- vxdetector/VXdetector.py | 271 ++++++++++++++++++++++++++++++++------- 1 file changed, 228 insertions(+), 43 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 755d138..6c69617 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -18,7 +18,33 @@ import random def biom_to_fastq(biom_file, fastq_output): - '''Converts a BIOM file to FASTQ format by extracting sequences and ensuring they are compatible with FASTQ standards.''' + """ + Converts a BIOM file into a FASTQ file. + + This function reads a BIOM file, which is a binary JSON-like format often used in metagenomics to store + sequence counts and metadata. It extracts sequence identifiers and sequences, and formats them into a + FASTQ format. Each sequence is given a default quality score to make it compatible with downstream + bioinformatics tools that require FASTQ files. + + Parameters + ---------- + biom_file : str + Path to the input BIOM file containing sequence data and associated metadata. + fastq_output : str + Path to the output FASTQ file where the converted sequences will be saved. Supports gzip compression + if the filename ends with '.gz'. + + Returns + ------- + None + Writes the converted sequences to the specified FASTQ file. + + Notes + ----- + - Ensure that the input BIOM file is properly formatted and includes sequence data. + - The FASTQ format generated assumes a fixed quality score, which may need adjustment based on downstream + applications.
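For reference, one record emitted by the converter has this shape; the fixed quality character ('I', Phred 40) is an assumption for illustration, since the docstring only states that a default score is used:

```python
# Hypothetical FASTQ record built from one BIOM observation.
seq_id, seq = 'observation_1', 'ACGTACGT'
record = f'@{seq_id}\n{seq}\n+\n{"I" * len(seq)}\n'
```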
+ """ with open(fastq_output, "w") as f: table = biom.load_table(biom_file) # Load the BIOM table sequences = table.ids(axis='observation') # Extract sequences @@ -31,42 +57,111 @@ def biom_to_fastq(biom_file, fastq_output): print(f"BIOM file successfully converted to FASTQ format at: {fastq_output}") -def sample_fastq(file_path, sample_size, sampled_indices=None): - '''Get random parts from the FASTQ file based on shared indices for paired-end reads.''' - sampled_reads = [] # List to hold sampled reads - open_func = gzip.open if file_path.endswith('.gz') else open +def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None): + """Read a (gzipped) fastQ file and randomly (without replacement) select + a given number of reads. + + Assumes that each reads comes in exactly four lines. To save memory and + have reasonable speed, we first pass through the file "just" to count + the total amount of lines (div by 4 gives the number of reads). + Then we randomly draw (without replacement) sample_size many numbers + between 0 and line-numbers (div by 4 to only target header lines) and sort + this list. Next, we read the file again and keep track of the line + number (ln). Whenever the line number matches the current random line + number, the file line and the three following are pushed to the + sampled_readlines array. + + Parameters + ---------- + file_path : str + File path of the fastQ (gzipped) file containing the reads. + sample_size : int + Number of random reads taken from the fastQ file. + sampled_indices : list of int, optional + Consistent sampling across paired end files + seed : int + Random generator seed. - # Count total reads in the file by iterating line by line - with open_func(file_path, 'rt') as f: - total_reads = sum(1 for _ in f) // 4 + Returns + ------- + Tuple of list of random read lines and list of randomly selected line + numbers. + """ - # Adjust sample_size if it's greater than total_reads - if sample_size > total_reads: - sample_size = total_reads + def _count_generator(reader): + b = reader(1024 * 1024) + while b: + yield b + b = reader(1024 * 1024) - if sampled_indices is None: - sampled_indices = sorted(random.sample(range(total_reads), sample_size)) + def _get_filehandle(file_path): + # open file either plain or as gzip compressed file + FH = None + if file_path.endswith('.gz'): + FH = gzip.open(file_path, 'rb') + else: + FH = open(file_path, 'rb') + return FH - with open_func(file_path, 'rt') as f: - read_idx = 0 - read_buffer = [] - for line in f: - read_buffer.append(line) - if len(read_buffer) == 4: # Once we have a full read - if read_idx in sampled_indices: - sampled_reads.extend(read_buffer) # Add the read if it matches the sampled index - read_buffer = [] # Clear the buffer for the next read - read_idx += 1 # Move to the next read + # first pass: count number of lines in the file + FH = _get_filehandle(file_path) + c_generator = _count_generator(FH.read) + number_lines = sum(buffer.count(b'\n') for buffer in c_generator) + assert number_lines % 4 == 0, "file %s does not contain a multiple of 4 lines! 
%i" % (file_path, number_lines) + FH.close() - # Stop if we've collected enough reads - if len(sampled_reads) >= sample_size * 4: - break + # second pass: iterate through file with line counts + # add next 4 lines to sampled_reads, iff linecount is in random selection + sampled_readlines = [] # List to hold sampled reads + if seed is not None: + random.seed(seed) + sel_reads = sorted(list(map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), sample_size)))) + FH = _get_filehandle(file_path) + ln = 0 # line number iterator + j = 0 # iterator through sorted, random header line numbers + while ln < number_lines: + line = FH.readline() + if ln == sel_reads[j]: + sampled_readlines.append(str(line, encoding='utf-8')) + for i in range(3): + line = FH.readline() + sampled_readlines.append(str(line, encoding='utf-8')) + ln += 1 + if j + 1 < len(sel_reads): + j += 1 + else: + break + ln += 1 + FH.close() - return sampled_reads, sampled_indices + return sampled_readlines, sel_reads def save_sampled_fastq(sampled_reads, output_path): - '''Save sampled reads to a temporary FASTQ file.''' + """ + Writes sampled sequencing reads into a FASTQ file. + + The function accepts a list of sampled reads in FASTQ format and saves them to the specified output + file. If the output file ends with '.gz', the file is automatically compressed using gzip. This function + ensures that the sampled reads are stored efficiently for use in subsequent steps like alignment or + analysis. + + Parameters + ---------- + sampled_reads : list of str + List of reads in FASTQ format, where each read consists of four lines: + - Line 1: Sequence identifier (e.g., @SEQ_ID). + - Line 2: Raw sequence. + - Line 3: Optional '+' separator. + - Line 4: Quality scores. + output_path : str + Path to the output file. If the file ends with '.gz', it will be written in a compressed format. + + Returns + ------- + None + Saves the sampled reads to the output file. + """ open_func = gzip.open if output_path.endswith('.gz') else open # Use gzip if output is .gz with open_func(output_path, 'wt') as f: # Write in text mode ('wt') @@ -74,13 +169,24 @@ def save_sampled_fastq(sampled_reads, output_path): def do_statistic(result): - '''Performs statistical analysis on the DataFrame + """ + Performs statistical analysis on sequencing data. + + This function analyzes the results from processing FASTQ files, calculating metrics such as the mean and + standard deviation for numeric columns in the data. It also identifies the most common variable region + across sequences, appending these statistics as new columns in the DataFrame. - Calculates the mean and standard deviation for all numeric columns - in the DataFrame, and determines the most common sequenced variable - region. Adds these statistics to the DataFrame and returns the updated - DataFrame. - ''' + Parameters + ---------- + result : pandas.DataFrame + DataFrame containing processed sequencing data. Expected columns include numeric metrics and + categorical data such as variable regions. + + Returns + ------- + pandas.DataFrame + Updated DataFrame with additional columns for calculated statistics. 
+ """ average = result.mean(numeric_only=True).to_frame().T # Calculate mean for numeric columns region = (result['Sequenced variable region'].mode().values) # Find the most common variable region region = ' / '.join(str(r) for r in region) # Format the result for better readability @@ -101,11 +207,28 @@ def do_statistic(result): def do_output(result, new_file, single_file): - '''Writes the results into a CSV file + """ + Writes processing results to a CSV file. + + Converts sequencing results stored in a dictionary into a DataFrame, optionally calculates statistics, + and writes the data to a CSV file for downstream analysis or record-keeping. If the results are from a + single input file, no additional statistics are calculated. - Converts the dictionary of results into a DataFrame, calculates statistics - if working with multiple files, and writes the data into a CSV file. - ''' + Parameters + ---------- + result : dict + Dictionary containing processing results, keyed by file name. + new_file : str + Path to the output CSV file where results will be saved. + single_file : bool + If True, skips statistical calculations as the results are from a single input file. + + Returns + ------- + None + Saves the results to the specified file. + """ + warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore warnings about future behavior result = pd.DataFrame(result).T.sort_index() # Convert the dictionary to a DataFrame and sort by index for column in result: @@ -122,9 +245,30 @@ def do_output(result, new_file, single_file): def process_file(fq_file, path, bowtie2_params, sample_size=None): """ - Processes a single FASTQ file, including optional read sampling, alignment with Bowtie2, - and overlap analysis using Bedtools. + Processes an individual FASTQ file through sampling, alignment, and overlap analysis. + + This function handles all steps for processing a single FASTQ file. It includes optional random sampling + of reads to reduce data size, alignment to a reference genome using Bowtie2, and analysis of overlap + regions using Bedtools. + + Parameters + ---------- + fq_file : str + Path to the input FASTQ file to process. + path : str + Path to the Bowtie2 library and index files. + bowtie2_params : str + Additional parameters for Bowtie2 alignment (e.g., "--fast --threads 4"). + sample_size : int, optional + Number of reads to randomly sample. If None, the entire file is processed. + + Returns + ------- + dict or None + A dictionary with processing results, including file name and metrics. Returns None if an error + occurs during processing. """ + paired = False result = {} try: @@ -160,9 +304,30 @@ def process_file(fq_file, path, bowtie2_params, sample_size=None): def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): """ - Manages the overall processing workflow, including directory traversal, BIOM file conversion, - and multiprocessing for FASTQ file processing. + Executes the full workflow for processing sequencing data. + + The workflow includes directory traversal, conversion of BIOM files to FASTQ format (if needed), and + parallelized processing of FASTQ files. Results are aggregated and optionally written to a CSV file. + + Parameters + ---------- + file_dir : str + Path to the directory or file containing input sequencing data. + new_file : str + Path to the CSV file where results will be saved. + write_csv : bool + If True, writes the results to a CSV file. + sample_size : int + Number of reads to sample from each FASTQ file. 
If None, all reads are processed. + bowtie2_params : str + Additional parameters for Bowtie2 alignment. + + Returns + ------- + None + Executes the workflow and outputs results as specified. """ + path = files_manager.get_lib() buildbowtie2(path) # Prepare Bowtie2 library result = {} @@ -188,7 +353,27 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): def main(): - '''Main function to parse user arguments and initiate the workflow.''' + """ + Parses user input and initiates the workflow. + + This function uses argparse to handle command-line arguments, passing them to the workflow function. + It provides flexibility for customizing the input directory, Bowtie2 parameters, and output options. + + Parameters + ---------- + None + + Returns + ------- + None + Executes the script based on user-provided inputs. + + Notes + ----- + - Command-line usage should follow the format specified in the argparse help message. + - Ensure all required dependencies (Bowtie2, Bedtools) are installed and accessible. + """ + parser = argparse.ArgumentParser(prog='VX detector', description=( 'This program tries to find which variable region of the 16S sequence was sequenced')) parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) From 7f0908cc7b49bc79c5af1bda07786608b4c88133 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 28 Nov 2024 13:09:35 +0100 Subject: [PATCH 11/37] Formatting --- vxdetector/VXdetector.py | 113 ++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 6c69617..5ce83ef 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -13,10 +13,10 @@ import tempfile from multiprocessing import Pool import biom -import shutil import gzip import random + def biom_to_fastq(biom_file, fastq_output): """ Converts a BIOM file into a FASTQ file. @@ -38,7 +38,7 @@ def biom_to_fastq(biom_file, fastq_output): ------- None Writes the converted sequences to the specified FASTQ file. - + Notes ----- - Ensure that the input BIOM file is properly formatted and includes sequence data. 
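
Reviewer note: the biom_to_fastq hunks in this patch only work when the BIOM observation IDs are themselves the sequences (the Deblur/DADA2 convention), since the FASTQ sequence is rebuilt from seq_id via filter(str.isalpha, ...) rather than read from table metadata, and every base gets the fixed quality 'I' (Phred 40 in Sanger encoding). Also, the hunk below wraps an expression inside the f-string braces across lines, which only parses on Python 3.12+ (PEP 701), and the follow-up in [PATCH 12/37] instead breaks the string with a backslash, which embeds the continuation line's indentation as literal spaces in the written output. A standalone sketch of the conversion under the same ID-equals-sequence assumption, using adjacent string literals to stay within line limits without changing the emitted bytes (the function name here is illustrative, not part of the patch):

    import biom

    def biom_ids_to_fastq(biom_file, fastq_output):
        """Sketch: assumes observation IDs are the sequences themselves."""
        table = biom.load_table(biom_file)  # parse the BIOM table
        with open(fastq_output, "w") as out:
            for seq_id in table.ids(axis='observation'):
                seq = ''.join(filter(str.isalpha, seq_id))  # letters only
                # truncated header plus constant quality 'I' for every base
                out.write(f"@{seq_id[:50]}\n{seq}\n+\n"
                          f"{'I' * len(seq)}\n")
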
@@ -50,11 +50,15 @@ def biom_to_fastq(biom_file, fastq_output): sequences = table.ids(axis='observation') # Extract sequences for seq_id in sequences: # Truncate the sequence ID if it’s too long for Bowtie2's requirements - truncated_id = seq_id[:50] # Cut the sequence ID to 50 characters (adjust if needed) + # Cut the sequence ID to 50 characters (adjust if needed) + truncated_id = seq_id[:50] # Check and sanitize the sequence to avoid unwanted characters - sanitized_seq = ''.join(filter(str.isalpha, seq_id)) # Ensure only letters are in the sequence - f.write(f"@{truncated_id}\n{sanitized_seq}\n+\n{'I' * len(sanitized_seq)}\n") # Write in FASTQ format - print(f"BIOM file successfully converted to FASTQ format at: {fastq_output}") + # Ensure only letters are in the sequence + sanitized_seq = ''.join(filter(str.isalpha, seq_id)) + f.write(f"@{truncated_id}\n{sanitized_seq}\n+\n{'I' * + len(sanitized_seq)}\n") # Write in FASTQ format + print(f"BIOM file successfully converted to FASTQ format at: { + fastq_output}") def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None): @@ -107,7 +111,8 @@ def _get_filehandle(file_path): FH = _get_filehandle(file_path) c_generator = _count_generator(FH.read) number_lines = sum(buffer.count(b'\n') for buffer in c_generator) - assert number_lines % 4 == 0, "file %s does not contain a multiple of 4 lines! %i" % (file_path, number_lines) + assert number_lines % 4 == 0, "file %s does not contain a multiple of 4 lines! %i" % ( + file_path, number_lines) FH.close() # second pass: iterate through file with line counts @@ -115,7 +120,8 @@ def _get_filehandle(file_path): sampled_readlines = [] # List to hold sampled reads if seed is not None: random.seed(seed) - sel_reads = sorted(list(map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), sample_size)))) + sel_reads = sorted(list( + map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), sample_size)))) FH = _get_filehandle(file_path) ln = 0 # line number iterator j = 0 # iterator through sorted, random header line numbers @@ -128,7 +134,7 @@ def _get_filehandle(file_path): sampled_readlines.append(str(line, encoding='utf-8')) ln += 1 if j + 1 < len(sel_reads): - j += 1 + j += 1 else: break ln += 1 @@ -162,7 +168,8 @@ def save_sampled_fastq(sampled_reads, output_path): None Saves the sampled reads to the output file. """ - open_func = gzip.open if output_path.endswith('.gz') else open # Use gzip if output is .gz + open_func = gzip.open if output_path.endswith( + '.gz') else open # Use gzip if output is .gz with open_func(output_path, 'wt') as f: # Write in text mode ('wt') f.writelines(sampled_reads) @@ -187,22 +194,32 @@ def do_statistic(result): pandas.DataFrame Updated DataFrame with additional columns for calculated statistics. 
""" - average = result.mean(numeric_only=True).to_frame().T # Calculate mean for numeric columns - region = (result['Sequenced variable region'].mode().values) # Find the most common variable region - region = ' / '.join(str(r) for r in region) # Format the result for better readability - region = region.replace('\'', '').replace('[', '').replace(']', '') # Clean up formatting + average = result.mean(numeric_only=True).to_frame( + ).T # Calculate mean for numeric columns + # Find the most common variable region + region = (result['Sequenced variable region'].mode().values) + # Format the result for better readability + region = ' / '.join(str(r) for r in region) + region = region.replace('\'', '').replace( + '[', '').replace(']', '') # Clean up formatting average['Sequenced variable region'] = region if 'Not properly paired' not in average.columns: - average['Not properly paired'] = 'not paired' # Handle cases where reads are not paired - std_dev = result.std(numeric_only=True).to_frame().T # Calculate standard deviation for numeric columns - statistic = pd.concat([average, std_dev], axis=0) # Combine mean and standard deviation dataframes + # Handle cases where reads are not paired + average['Not properly paired'] = 'not paired' + # Calculate standard deviation for numeric columns + std_dev = result.std(numeric_only=True).to_frame().T + # Combine mean and standard deviation dataframes + statistic = pd.concat([average, std_dev], axis=0) # Select relevant columns and reorder them statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Add descriptors for the statistics - statistic = statistic.set_index('row_descriptor') # Set the row descriptors as the index - result = pd.concat([statistic, result], axis=0) # Combine the statistical results with the original data + # Add descriptors for the statistics + statistic['row_descriptor'] = ['Average', 'Standard deviation'] + # Set the row descriptors as the index + statistic = statistic.set_index('row_descriptor') + # Combine the statistical results with the original data + result = pd.concat([statistic, result], axis=0) return result @@ -228,13 +245,17 @@ def do_output(result, new_file, single_file): None Saves the results to the specified file. 
""" - - warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore warnings about future behavior - result = pd.DataFrame(result).T.sort_index() # Convert the dictionary to a DataFrame and sort by index + + # Ignore warnings about future behavior + warnings.simplefilter(action='ignore', category=FutureWarning) + # Convert the dictionary to a DataFrame and sort by index + result = pd.DataFrame(result).T.sort_index() for column in result: - result[column] = pd.to_numeric(result[column], errors='ignore') # Convert columns to numeric where possible + # Convert columns to numeric where possible + result[column] = pd.to_numeric(result[column], errors='ignore') if single_file is False: - result = do_statistic(result) # If multiple files, calculate statistics + # If multiple files, calculate statistics + result = do_statistic(result) else: # Select relevant columns for single file processing result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', @@ -268,30 +289,34 @@ def process_file(fq_file, path, bowtie2_params, sample_size=None): A dictionary with processing results, including file name and metrics. Returns None if an error occurs during processing. """ - + paired = False result = {} try: with tempfile.TemporaryDirectory() as temp_path: file_name = os.path.basename(fq_file) - read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) + read2_file = os.path.join(os.path.dirname( + fq_file), file_name.replace('_R1_', '_R2_')) if '_R1_' in file_name and os.path.exists(read2_file): paired = True # Paired-end sequencing detected if sample_size: # Perform sampling if required - sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) + sampled_reads_R1, sampled_indices = sample_fastq( + fq_file, sample_size) temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) fq_file = temp_fastq_R1 if paired: # Process the reverse reads for paired-end data - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) + sampled_reads_R2, _ = sample_fastq( + read2_file, sample_size, sampled_indices) temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) read2_file = temp_fastq_R2 # Perform alignment and overlap analysis - aligned_path, Error = mapbowtie2(fq_file, read2_file, path, temp_path, paired, bowtie2_params) + aligned_path, Error = mapbowtie2( + fq_file, read2_file, path, temp_path, paired, bowtie2_params) if Error: return None overlap(path, temp_path, aligned_path) @@ -327,14 +352,15 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): None Executes the workflow and outputs results as specified. """ - + path = files_manager.get_lib() buildbowtie2(path) # Prepare Bowtie2 library result = {} single_file = False if file_dir.endswith('.biom'): # Handle BIOM file input - temp_fastq = os.path.join(tempfile.gettempdir(), 'temp_sequences.fastq') + temp_fastq = os.path.join( + tempfile.gettempdir(), 'temp_sequences.fastq') biom_to_fastq(file_dir, temp_fastq) result = process_file(temp_fastq, path, bowtie2_params, sample_size) single_file = True @@ -367,29 +393,32 @@ def main(): ------- None Executes the script based on user-provided inputs. - + Notes ----- - Command-line usage should follow the format specified in the argparse help message. - Ensure all required dependencies (Bowtie2, Bedtools) are installed and accessible. 
""" - + parser = argparse.ArgumentParser(prog='VX detector', description=( 'This program tries to find which variable region of the 16S sequence was sequenced')) - parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) - parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, + parser.add_argument('dir_path', help=( + 'Directory path of the directory containing multiple fastq or fasta files.')) + parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, help='User can specify a file format in which the output is written in the Output folder.') - parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', + parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', help='If set the output will be written in a .csv file in the Output folder') - parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, help='Number of reads to sample from each FASTQ file') - parser.add_argument('-b','--bowtie2_params', dest='bowtie2_params', default= None, + parser.add_argument('-b', '--bowtie2_params', dest='bowtie2_params', default=None, help='Additional parameters to pass to Bowtie2') # Parse the user input arguments args = parser.parse_args() - + # Start the workflow with provided arguments - workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size, args.bowtie2_params) + workflow(args.dir_path, args.output_file, args.write_csv, + args.sample_size, args.bowtie2_params) + if __name__ == '__main__': - main() # Entry point of the script \ No newline at end of file + main() # Entry point of the script From dd79888c89ef1ccb87cb1d302c6a6c683ac2d64d Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 28 Nov 2024 13:40:36 +0100 Subject: [PATCH 12/37] more formatting --- vxdetector/VXdetector.py | 148 ++++++++++++++++++++++++--------------- 1 file changed, 90 insertions(+), 58 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 5ce83ef..8866a32 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -21,17 +21,20 @@ def biom_to_fastq(biom_file, fastq_output): """ Converts a BIOM file into a FASTQ file. - This function reads a BIOM file, which is a binary JSON-like format often used in metagenomics to store - sequence counts and metadata. It extracts sequence identifiers and sequences, and formats them into a - FASTQ format. Each sequence is given a default quality score to make it compatible with downstream - bioinformatics tools that require FASTQ files. + This function reads a BIOM file, which is a binary JSON-like format often + used in metagenomics to store sequence counts and metadata. It extracts + sequence identifiers and sequences, and formats them into a FASTQ format. + Each sequence is given a default quality score to make it compatible with + downstream bioinformatics tools that require FASTQ files. Parameters ---------- biom_file : str - Path to the input BIOM file containing sequence data and associated metadata. + Path to the input BIOM file containing sequence data + and associated metadata. fastq_output : str - Path to the output FASTQ file where the converted sequences will be saved. Supports gzip compression + Path to the output FASTQ file where the converted sequences will + be saved. Supports gzip compression if the filename ends with '.gz'. 
Returns @@ -41,24 +44,26 @@ def biom_to_fastq(biom_file, fastq_output): Notes ----- - - Ensure that the input BIOM file is properly formatted and includes sequence data. - - The FASTQ format generated assumes a fixed quality score, which may need adjustment based on downstream - applications. + - Ensure that the input BIOM file is properly formatted and + includes sequence data. + - The FASTQ format generated assumes a fixed quality score, + which may need adjustment based on downstream applications. """ with open(fastq_output, "w") as f: table = biom.load_table(biom_file) # Load the BIOM table sequences = table.ids(axis='observation') # Extract sequences for seq_id in sequences: - # Truncate the sequence ID if it’s too long for Bowtie2's requirements + # Truncate the sequence ID if it’s too long for Bowtie2's + # requirements. # Cut the sequence ID to 50 characters (adjust if needed) truncated_id = seq_id[:50] # Check and sanitize the sequence to avoid unwanted characters # Ensure only letters are in the sequence sanitized_seq = ''.join(filter(str.isalpha, seq_id)) - f.write(f"@{truncated_id}\n{sanitized_seq}\n+\n{'I' * - len(sanitized_seq)}\n") # Write in FASTQ format - print(f"BIOM file successfully converted to FASTQ format at: { - fastq_output}") + f.write(f"@{truncated_id}\n{sanitized_seq}\n+ \ + \n{'I' * len(sanitized_seq)}\n") # Write in FASTQ format + print(f"BIOM file successfully converted to FASTQ format at: \ + {fastq_output}") def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None): @@ -69,8 +74,8 @@ def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None): have reasonable speed, we first pass through the file "just" to count the total amount of lines (div by 4 gives the number of reads). Then we randomly draw (without replacement) sample_size many numbers - between 0 and line-numbers (div by 4 to only target header lines) and sort - this list. Next, we read the file again and keep track of the line + between 0 and line-numbers (div by 4 to only target header lines) and + sort this list. Next, we read the file again and keep track of the line number (ln). Whenever the line number matches the current random line number, the file line and the three following are pushed to the sampled_readlines array. @@ -111,7 +116,8 @@ def _get_filehandle(file_path): FH = _get_filehandle(file_path) c_generator = _count_generator(FH.read) number_lines = sum(buffer.count(b'\n') for buffer in c_generator) - assert number_lines % 4 == 0, "file %s does not contain a multiple of 4 lines! %i" % ( + assert number_lines % 4 == 0, "file %s does not contain \ + a multiple of 4 lines! %i" % ( file_path, number_lines) FH.close() @@ -121,7 +127,8 @@ def _get_filehandle(file_path): if seed is not None: random.seed(seed) sel_reads = sorted(list( - map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), sample_size)))) + map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), + sample_size)))) FH = _get_filehandle(file_path) ln = 0 # line number iterator j = 0 # iterator through sorted, random header line numbers @@ -147,10 +154,11 @@ def save_sampled_fastq(sampled_reads, output_path): """ Writes sampled sequencing reads into a FASTQ file. - The function accepts a list of sampled reads in FASTQ format and saves them to the specified output - file. If the output file ends with '.gz', the file is automatically compressed using gzip. 
This function - ensures that the sampled reads are stored efficiently for use in subsequent steps like alignment or - analysis. + The function accepts a list of sampled reads in FASTQ format and + saves them to the specified output file. If the output file ends + with '.gz', the file is automatically compressed using gzip. + This function ensures that the sampled reads are stored efficiently + for use in subsequent steps like alignment or analysis. Parameters ---------- @@ -161,7 +169,8 @@ def save_sampled_fastq(sampled_reads, output_path): - Line 3: Optional '+' separator. - Line 4: Quality scores. output_path : str - Path to the output file. If the file ends with '.gz', it will be written in a compressed format. + Path to the output file. If the file ends with '.gz', + it will be written in a compressed format. Returns ------- @@ -179,15 +188,17 @@ def do_statistic(result): """ Performs statistical analysis on sequencing data. - This function analyzes the results from processing FASTQ files, calculating metrics such as the mean and - standard deviation for numeric columns in the data. It also identifies the most common variable region - across sequences, appending these statistics as new columns in the DataFrame. + This function analyzes the results from processing FASTQ files, + calculating metrics such as the mean and standard deviation for + numeric columns in the data. It also identifies the most common + variable region across sequences, appending these statistics as + new columns in the DataFrame. Parameters ---------- result : pandas.DataFrame - DataFrame containing processed sequencing data. Expected columns include numeric metrics and - categorical data such as variable regions. + DataFrame containing processed sequencing data. Expected columns + include numeric metrics and categorical data such as variable regions. Returns ------- @@ -211,9 +222,10 @@ def do_statistic(result): # Combine mean and standard deviation dataframes statistic = pd.concat([average, std_dev], axis=0) # Select relevant columns and reorder them - statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region']] + statistic = statistic[ + ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] # Add descriptors for the statistics statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Set the row descriptors as the index @@ -227,8 +239,9 @@ def do_output(result, new_file, single_file): """ Writes processing results to a CSV file. - Converts sequencing results stored in a dictionary into a DataFrame, optionally calculates statistics, - and writes the data to a CSV file for downstream analysis or record-keeping. If the results are from a + Converts sequencing results stored in a dictionary into a DataFrame, + optionally calculates statistics, and writes the data to a CSV file + for downstream analysis or record-keeping. If the results are from a single input file, no additional statistics are calculated. Parameters @@ -238,7 +251,8 @@ def do_output(result, new_file, single_file): new_file : str Path to the output CSV file where results will be saved. single_file : bool - If True, skips statistical calculations as the results are from a single input file. 
+ If True, skips statistical calculations as the results + are from a single input file. Returns ------- @@ -258,19 +272,22 @@ def do_output(result, new_file, single_file): result = do_statistic(result) else: # Select relevant columns for single file processing - result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region']] + result = result[ + ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] result.to_csv(new_file, index=True) # Write the results to a CSV file def process_file(fq_file, path, bowtie2_params, sample_size=None): """ - Processes an individual FASTQ file through sampling, alignment, and overlap analysis. + Processes an individual FASTQ file through sampling, alignment, + and overlap analysis. - This function handles all steps for processing a single FASTQ file. It includes optional random sampling - of reads to reduce data size, alignment to a reference genome using Bowtie2, and analysis of overlap - regions using Bedtools. + This function handles all steps for processing a single FASTQ file. + It includes optional random sampling of reads to reduce data size, + alignment to a reference genome using Bowtie2, and analysis of + overlap regions using Bedtools. Parameters ---------- @@ -279,14 +296,17 @@ def process_file(fq_file, path, bowtie2_params, sample_size=None): path : str Path to the Bowtie2 library and index files. bowtie2_params : str - Additional parameters for Bowtie2 alignment (e.g., "--fast --threads 4"). + Additional parameters for Bowtie2 alignment + (e.g., "--fast --threads 4"). sample_size : int, optional - Number of reads to randomly sample. If None, the entire file is processed. + Number of reads to randomly sample. + If None, the entire file is processed. Returns ------- dict or None - A dictionary with processing results, including file name and metrics. Returns None if an error + A dictionary with processing results, including file name and metrics. + Returns None if an error occurs during processing. """ @@ -331,8 +351,9 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): """ Executes the full workflow for processing sequencing data. - The workflow includes directory traversal, conversion of BIOM files to FASTQ format (if needed), and - parallelized processing of FASTQ files. Results are aggregated and optionally written to a CSV file. + The workflow includes directory traversal, conversion of BIOM files to + FASTQ format (if needed), and parallelized processing of FASTQ files. + Results are aggregated and optionally written to a CSV file. Parameters ---------- @@ -343,7 +364,8 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): write_csv : bool If True, writes the results to a CSV file. sample_size : int - Number of reads to sample from each FASTQ file. If None, all reads are processed. + Number of reads to sample from each FASTQ file. + If None, all reads are processed. bowtie2_params : str Additional parameters for Bowtie2 alignment. @@ -382,8 +404,9 @@ def main(): """ Parses user input and initiates the workflow. - This function uses argparse to handle command-line arguments, passing them to the workflow function. - It provides flexibility for customizing the input directory, Bowtie2 parameters, and output options. 
+ This function uses argparse to handle command-line arguments, passing them + to the workflow function. It provides flexibility for customizing the input + directory, Bowtie2 parameters, and output options. Parameters ---------- @@ -396,21 +419,30 @@ def main(): Notes ----- - - Command-line usage should follow the format specified in the argparse help message. - - Ensure all required dependencies (Bowtie2, Bedtools) are installed and accessible. + - Command-line usage should follow the format specified in + the argparse help message. + - Ensure all required dependencies (Bowtie2, Bedtools) are + installed and accessible. """ parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This program tries to find which variable region of the 16S sequence was sequenced')) + 'This program tries to find which variable region of the 16S \ + sequence was sequenced')) parser.add_argument('dir_path', help=( - 'Directory path of the directory containing multiple fastq or fasta files.')) - parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, - help='User can specify a file format in which the output is written in the Output folder.') + 'Directory path of the directory containing \ + multiple fastq or fasta files.')) + parser.add_argument('-o', '--output', dest='output_file', + default=sys.stdout, + help='User can specify a file format in which the \ + output is written in the Output folder.') parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', - help='If set the output will be written in a .csv file in the Output folder') - parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, + help='If set the output will be written in a .csv \ + file in the Output folder') + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, + default=1000, help='Number of reads to sample from each FASTQ file') - parser.add_argument('-b', '--bowtie2_params', dest='bowtie2_params', default=None, + parser.add_argument('-b', '--bowtie2_params', dest='bowtie2_params', + default=None, help='Additional parameters to pass to Bowtie2') # Parse the user input arguments args = parser.parse_args() From 3cf6e0fb3f56383da27259eaf9bb9ac702e3b6a0 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 28 Nov 2024 13:44:07 +0100 Subject: [PATCH 13/37] formatted for flake8 --- vxdetector/VXdetector.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 8866a32..7c0f9d0 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -156,7 +156,7 @@ def save_sampled_fastq(sampled_reads, output_path): The function accepts a list of sampled reads in FASTQ format and saves them to the specified output file. If the output file ends - with '.gz', the file is automatically compressed using gzip. + with '.gz', the file is automatically compressed using gzip. This function ensures that the sampled reads are stored efficiently for use in subsequent steps like alignment or analysis. 
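
Reviewer note: the workflow hunk below only re-wraps the Pool.starmap call for flake8; the fan-out pattern itself is unchanged: build one argument tuple per forward-read file, skip the '_R2_' mates (process_file locates them from the '_R1_' name), and merge the per-file result dicts while dropping files that returned None. A minimal runnable sketch of that pattern, where the worker body, library path, and file names are placeholders rather than the real process_file:

    from multiprocessing import Pool

    def process_file(fq_file, path, bowtie2_params, sample_size):
        # stand-in for the real per-file pipeline
        return {fq_file: {'Number of Reads': sample_size}}

    if __name__ == '__main__':
        fastq_files = ['a_R1_.fastq', 'a_R2_.fastq', 'b_R1_.fastq']
        jobs = [(fq, '/path/to/lib/', None, 100)
                for fq in fastq_files if '_R2_' not in fq]
        with Pool() as pool:
            results = pool.starmap(process_file, jobs)
        result = {}
        for res in results:
            if res:  # skip files that failed and returned None
                result.update(res)
        print(result)
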
@@ -392,8 +392,10 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params):
     elif os.path.isdir(file_dir):  # Process directory of FASTQ files
         fastq_files = glob.glob(f'{file_dir}/**/*.fastq*', recursive=True)
         with Pool() as pool:
-            results = pool.starmap(process_file, [(fq_file, path, bowtie2_params, sample_size)
-                                   for fq_file in fastq_files if '_R2_' not in fq_file])
+            results = pool.starmap(process_file, [(fq_file, path,
+                                                   bowtie2_params, sample_size)
+                                                  for fq_file in fastq_files
+                                                  if '_R2_' not in fq_file])
             for res in results:
                 if res:
                     result.update(res)

From 3849a7d0bc5ce0dfe07f234609229fe094839184 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Thu, 12 Dec 2024 16:29:53 +0100
Subject: [PATCH 14/37] Delete .DS_Store

---
 .DS_Store | Bin 8196 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 6b56441b7de3097435e94e155910f8b2d2e61d93..0000000000000000000000000000000000000000
Binary files a/.DS_Store and /dev/null differ

From dd79888c89ef1ccb87cb1d302c6a6c683ac2d64d Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Thu, 12 Dec 2024 16:31:35 +0100
Subject: [PATCH 15/37] Update .gitignore ignore DS_Store

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index f133658..d1dcd15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .coverage
 coverage.lcov
 bowtie2.*.bt2
+.DS_Store

From ceaa08e9b5ebe0b7bee576c37a73afd00a351676 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Thu, 12 Dec 2024 16:34:02 +0100
Subject: [PATCH 16/37] Delete vxdetector/testlog

---
 vxdetector/testlog | 9 ---------
 1 file changed, 9 deletions(-)
 delete mode 100644 vxdetector/testlog

diff --git a/vxdetector/testlog b/vxdetector/testlog
deleted file mode 100644
index 6e6ca14..0000000
--- a/vxdetector/testlog
+++ /dev/null
@@ -1,9 +0,0 @@
-sh: /bin/bowtie2-build: No such file or directory
-Traceback (most recent call last):
-  File "/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 144, in <module>
-    main()
-  File "/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 141, in main
-    workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size)
-  File "/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 77, in workflow
-    raise ValueError('There were no FASTQ files in this directory')
-ValueError: There were no FASTQ files in this directory

From 844c824fc5e8455fd15c8feca7246a913b565d3c Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Thu, 12 Dec 2024 16:40:35 +0100
Subject: [PATCH 17/37] Update environment.yml

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 
7358ce6..8b4c374 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - flake8 - - nose + - pytest - coverage >= 6 # to ensure lcov option is available - pandas - samtools From 8867fa87906d1fdf832816d2050c7e319acd03d4 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 12 Dec 2024 16:45:48 +0100 Subject: [PATCH 18/37] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 8b4c374..a597d56 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - flake8 - - pytest + - pynose - coverage >= 6 # to ensure lcov option is available - pandas - samtools From 2b2a01aab7acbfdc61fabcec1205a10475d2905c Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 12 Dec 2024 16:47:50 +0100 Subject: [PATCH 19/37] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index a597d56..7358ce6 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - flake8 - - pynose + - nose - coverage >= 6 # to ensure lcov option is available - pandas - samtools From 37f860122c8c19303db58515bb143a413de101fb Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Sat, 18 Jan 2025 22:37:10 +0100 Subject: [PATCH 20/37] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 7358ce6..a597d56 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - flake8 - - nose + - pynose - coverage >= 6 # to ensure lcov option is available - pandas - samtools From 85b5d96f65d5b5251b2542c6a0ca81ddbec89ee0 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Sat, 18 Jan 2025 22:47:21 +0100 Subject: [PATCH 21/37] Update environment.yml --- environment.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index a597d56..b4cb5d0 100644 --- a/environment.yml +++ b/environment.yml @@ -5,9 +5,12 @@ channels: - defaults dependencies: - flake8 - - pynose - - coverage >= 6 # to ensure lcov option is available - pandas - samtools - bedtools - bowtie2 + - coverage >= 6 # to ensure lcov option is available + - pip # Add pip to handle packages not available in conda + - pip: + - pynose # Install pynose via pip, if available + From 9f15b023d479d4877156623c8b1c73035c3bf322 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Sat, 18 Jan 2025 23:00:24 +0100 Subject: [PATCH 22/37] Update VXdetector.py --- vxdetector/VXdetector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 7c0f9d0..a8e62e8 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -6,7 +6,7 @@ import sys import warnings import pandas as pd -import Output_counter as Output_counter +import Output_counter import files_manager as files_manager from interact_bowtie2 import mapbowtie2, buildbowtie2 from interact_bedtools import overlap From 015bbf5e9659d02f581845cb1034994b89f3dc5c Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Sat, 18 Jan 2025 23:12:53 +0100 Subject: [PATCH 23/37] Update test_VXdetector.py --- vxdetector/tests/test_VXdetector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py index a260e47..dbf6981 100644 
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -34,7 +34,7 @@ def tearDown(self):
         shutil.rmtree(self.fp_tmpdir)
 
     def test_csv_output(self):
-        new_file = f'{self.fp_tmpdir}test.csv'
+        new_file = f'{self.fp_tmpdir}/test.csv'
         single_file = True
         vx.do_output(self.result, new_file, single_file)
         content = []
@@ -63,7 +63,7 @@ def test_numeric_conversion(self):
             'V6': 0.0122571189279732, }}
         single_file = True
-        new_file = f'{self.fp_tmpdir}test3.csv'
+        new_file = f'{self.fp_tmpdir}/test3.csv'
         vx.do_output(mixed_result, new_file, single_file)
         content = []
         with open(self.output_test)as f:
@@ -90,7 +90,7 @@ def test_output_options(self):
         vx.do_output(self.result, new_file, single_file)
         sys.stdout = sys.__stdout__
         self.assertEqual(capturedOutput.getvalue(), expected)
-        new_file = f'{self.fp_tmpdir}test2.csv'
+        new_file = f'{self.fp_tmpdir}/test2.csv'
         vx.do_output(self.result, new_file, single_file)
         self.assertTrue(os.path.exists(new_file))

From 528ab57672a60c804c562db54b7ee31b0dde919b Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Sat, 18 Jan 2025 23:20:09 +0100
Subject: [PATCH 24/37] Update test_VXdetector.py

---
 vxdetector/tests/test_VXdetector.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index dbf6981..a260e47 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -34,7 +34,7 @@ def tearDown(self):
         shutil.rmtree(self.fp_tmpdir)
 
     def test_csv_output(self):
-        new_file = f'{self.fp_tmpdir}/test.csv'
+        new_file = f'{self.fp_tmpdir}test.csv'
         single_file = True
         vx.do_output(self.result, new_file, single_file)
         content = []
@@ -63,7 +63,7 @@ def test_numeric_conversion(self):
             'V6': 0.0122571189279732, }}
         single_file = True
-        new_file = f'{self.fp_tmpdir}/test3.csv'
+        new_file = f'{self.fp_tmpdir}test3.csv'
        vx.do_output(mixed_result, new_file, single_file)
         content = []
         with open(self.output_test)as f:
@@ -90,7 +90,7 @@ def test_output_options(self):
         vx.do_output(self.result, new_file, single_file)
         sys.stdout = sys.__stdout__
         self.assertEqual(capturedOutput.getvalue(), expected)
-        new_file = f'{self.fp_tmpdir}/test2.csv'
+        new_file = f'{self.fp_tmpdir}test2.csv'
         vx.do_output(self.result, new_file, single_file)
         self.assertTrue(os.path.exists(new_file))

From d2833b7181ddd266bc21f777e51bb1edd814b3e8 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Fri, 31 Jan 2025 16:13:06 +0100
Subject: [PATCH 25/37] cleanup

---
 Output/.DS_Store           | Bin 6148 -> 0 bytes
 environment.yml            | 15 +++++++++++++++
 vxdetector/.DS_Store       | Bin 6148 -> 0 bytes
 vxdetector/tests/.DS_Store | Bin 6148 -> 0 bytes
 4 files changed, 15 insertions(+)
 delete mode 100644 Output/.DS_Store
 create mode 100644 environment.yml
 delete mode 100644 vxdetector/.DS_Store
 delete mode 100644 vxdetector/tests/.DS_Store

diff --git a/Output/.DS_Store b/Output/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/Output/.DS_Store and /dev/null differ

diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..3388e23
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,15 @@
[the first lines of the new environment.yml were lost in extraction; the surviving tail follows]
+  - coverage >= 6 # to ensure lcov option is available
+  - pip # Add pip to handle packages not available in conda
+  - pip:
+    - pynose # Install pynose via pip, if available
\ No newline at end of file

diff --git a/vxdetector/.DS_Store b/vxdetector/.DS_Store
deleted file mode 100644
index b731e8a11f420afc3e722ba25df51d9e5f08f722..0000000000000000000000000000000000000000
Binary files a/vxdetector/.DS_Store and /dev/null differ

diff --git a/vxdetector/tests/.DS_Store b/vxdetector/tests/.DS_Store
deleted file mode 100644
Binary files a/vxdetector/tests/.DS_Store and /dev/null differ

From 7d60e26afc4435b4fa7f0f19516dbd69b6431efe Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Fri, 31 Jan 2025 16:23:55 +0100
Subject: [PATCH 26/37] TestChanges

---
 vxdetector/VXdetector.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py
index a8e62e8..3f79bd2 100644
--- a/vxdetector/VXdetector.py
+++ b/vxdetector/VXdetector.py
@@ -6,10 +6,10 @@ import sys
 import warnings
 import pandas as pd
-import Output_counter
-import files_manager as files_manager
-from interact_bowtie2 import mapbowtie2, buildbowtie2
-from interact_bedtools import overlap
+import vxdetector.Output_counter as Output_counter
+import vxdetector.files_manager as files_manager
+from vxdetector.interact_bowtie2 import mapbowtie2, buildbowtie2
+from vxdetector.interact_bedtools import overlap
 import tempfile
 from multiprocessing import Pool
 import biom

From 10398e93cda8cb00dbbdc23dc88c32d97064a819 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Fri, 31 Jan 2025 16:37:46 +0100
Subject: [PATCH 27/37] added biom to env

---
 environment.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 3388e23..d21489e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,4 +12,5 @@ dependencies:
   - coverage >= 6 # to ensure lcov option is available
   - pip # Add pip to handle packages not available in conda
   - pip:
-    - pynose # Install pynose via pip, if available
\ No newline at end of file
+    - pynose # Install pynose via pip, if available
+    - biom
\ No newline at end of file

From 4ab20f703009f07e5993003c30dc36b623a3c1fe Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Fri, 31 Jan 2025 16:39:54 +0100
Subject: [PATCH 28/37] -format

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index d21489e..0815b9e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -13,4 +13,4 @@ dependencies:
   - pip # Add pip to handle packages not available in conda
   - pip:
     - pynose # Install pynose via pip, if available
-    - biom
\ No newline at end of file
+    - biom-format
\ No newline at end of file

From 0edb2894a7263855dd619281cf4104de3bdce53f Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Mon, 3 Feb 2025 13:48:29 +0100
Subject: [PATCH 29/37] Changed test to include new params

---
 vxdetector/tests/test_VXdetector.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index a260e47..d68e4a3 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -151,7 +151,7 @@ def test_singleFile(self):
        expected = 
f'{self.path}test_data/Output_test.csv'
         actual = f'{self.fp_tmpdir}/singleFile_test.csv'
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, False)
+        vx.workflow(test_file, actual, False, 1000, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -166,7 +166,7 @@ def test_directory(self):
         expected = f'{self.path}test_data/dir_test.csv'
         actual = f'{self.path}/test_data/dir_test_actual.csv'
         test_file = f'{self.path}test_data/test_dir/'
-        vx.workflow(test_file, actual, False)
+        vx.workflow(test_file, actual, False, 1000, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -181,7 +181,7 @@ def test_c_option(self):
         expected = f'{self.path}test_data/Output_test.csv'
         actual = sys.stdout
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, True)
+        vx.workflow(test_file, actual, False, 1000, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -194,25 +194,25 @@ def test_c_option(self):
 
     def test_no_fastq(self):
         with self.assertRaises(ValueError) as cm:
-            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout, False)
+            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout, False, 1000, None)
         self.assertEqual('There were no FASTQ files in this directory',
                          str(cm.exception))
 
     def test_Error(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/Output_test.csv',
-                        sys.stdout, False)
+                        sys.stdout, False, 1000, None)
         self.assertEqual('This file does not look like a fastq file',
                          str(cm.exception))
 
     def test_raise(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_test.fastq',
-                        sys.stdout, False)
+                        sys.stdout, False, 1000, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_'
-                        'paired_R1_001.fastq', sys.stdout, False)
+                        'paired_R1_001.fastq', sys.stdout, False, 1000, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))

From bf931b651b8bcc235f80e6fc73e64be178554407 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Mon, 3 Feb 2025 13:51:20 +0100
Subject: [PATCH 30/37] Formatting for Lint

---
 vxdetector/tests/test_VXdetector.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index d68e4a3..547e5d8 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -194,7 +194,8 @@ def test_c_option(self):
 
     def test_no_fastq(self):
         with self.assertRaises(ValueError) as cm:
-            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout, False, 1000, None)
+            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout,
+                        False, 1000, None)
         self.assertEqual('There were no FASTQ files in this directory',
                          str(cm.exception))
 
From b910cef7383a2022ab0131d7b2ccacfde97cc734 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Mon, 3 Feb 2025 14:43:44 +0100
Subject: [PATCH 31/37] Changed the interact_bowtie2 Script

---
 vxdetector/interact_bowtie2.py | 144 ++++++++++++---------------
 1 file changed, 52 insertions(+), 92 deletions(-)

diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py
index 912657a..2438cee 100644
--- a/vxdetector/interact_bowtie2.py
+++ b/vxdetector/interact_bowtie2.py
@@ -2,7 +2,9 @@
 
 import os
 import shutil
+import subprocess
 
+# Check for the installation paths of 
Bowtie2, Samtools, and Bedtools bowtie2_path = shutil.which('bowtie2') if bowtie2_path is None: bowtie2_path = '$CONDA/bin/bowtie2' @@ -12,109 +14,67 @@ bedtools_path = shutil.which('bedtools') if bedtools_path is None: bedtools_path = '$CONDA/bin/bedtools' -# finds where bowtie2, samtools and bedtools -# are installed - def buildbowtie2(path): - r'''Builds bowtie2 index + """Builds Bowtie2 index for the reference genome.""" + input_ref = f'{path}Indexed_bt2/85_otus.fasta' # Path to the reference genome + output_path = f'{path}Indexed_bt2/bowtie2' # Output path for the index + # Check if the index already exists; if not, build it + if not os.path.exists(f'{output_path}.1.bt2'): + os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null') - This function builds an index for bowtie2 it is the equivalent - of building it manually with the "" command. +def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None): + """Maps reads against a previously built Bowtie2 index.""" + + index_path = f'{path}Indexed_bt2/bowtie2' # Path to the Bowtie2 index + # Check if the index files exist + if not os.path.exists(f'{index_path}.1.bt2'): + raise FileNotFoundError(f'No index files found under "{index_path}"') - Parameters - ---------- - path : str - The program filepath. The directory where it needs to look - for the reference genome and where the index should be saved. + log_path = f'{temp_path}bowtie2.log' # Log file for Bowtie2 + bed_logpath = f'{temp_path}bed.log' # Log file for Bedtools + Error = False - ''' - input_ref = f'{path}Indexed_bt2/85_otus.fasta' - # reference genome greengenes is used - output_path = f'{path}Indexed_bt2/bowtie2' - if os.path.exists(f'{output_path}.1.bt2'): - # only builds an index for the alignment if there isn't already one - pass - else: - os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} ' - '> /dev/null') - # if a indexfile is missing a new index is build + # Base command for Bowtie2 + bowtie2_base_cmd = [bowtie2_path, '-x', index_path, '--fast'] + # Add additional Bowtie2 parameters if provided + if bowtie2_params: + bowtie2_base_cmd += bowtie2_params.split() -def mapbowtie2(fasta_file, read2_file, path, temp_path, paired): - r'''Maps reads against index + # Handle paired-end reads + if paired: + aligned_path = f'{temp_path}paired.bed' # Output path for paired reads + bowtie2_base_cmd += ['-1', fasta_file, '-2', read2_file] + + # Commands for Samtools and Bedtools + cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4'] + cmd_bedtools = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin'] - This function maps read files against a previously build index. - The bowtie standard output .sam is piped into samtools view and - converted to .bam. In this step all unmapped reads and those with a - lower mapquality than requested are discarded. - If the reads are paired the .bam file is converted to .bed with bedtools - bamtobed that way they are properly paired and can be intersected. 
+ # Run the Bowtie2, Samtools, and Bedtools pipeline + with open(log_path, 'w') as log_file, open(bed_logpath, 'w') as bed_log, open(aligned_path, 'w') as output_file: + bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE) + samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE) + subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file, stderr=bed_log) - Parameters - ---------- - fasta_file : str - Path to a .fastq file (or .fastq.gz). This file contains the (forward) - reads which are mapped against the reference. - read2_file : str - Path to a .fastq file (or .fastq.gz) containing backwards reads. - Is disregarded if paired = False. If not needed can be ''. - path : str - The program filepath. The directory where it needs to look - for the index files. - temp_path : str - Path to a (temporary) folder where the resulting .log files and output - files are saved - paired : Bool - Dictates wether bowtie2 alignes paired or unpaired reads. - If paired is set to False bowtie2 will do an unpaired alignment - otherwise it will do a paired one. + # Handle unpaired reads + else: + aligned_path = f'{temp_path}unpaired.bed' # Output path for unpaired reads + bowtie2_base_cmd += ['-U', fasta_file] + + # Commands for Samtools and Bedtools + cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4'] + cmd_bedtools = [bedtools_path, 'bamtobed', '-i', 'stdin'] - Returns - ------- - aligned_path : str - Path to the converted bowtie2 Output file in which either paired or - unpaired reads have been mapped against the reference. - Error : Bool - Wether or not bowtie2 ended with an error. - In the context of the whole program Error = True leads to either - a raised Exception (if only a single file was given) or to the current - file being skipped (if a directory was given). + # Run the Bowtie2 and Samtools pipeline + with open(log_path, 'w') as log_file, open(aligned_path, 'w') as output_file: + bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE) + samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE) + subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file) - ''' - index_path = f'{path}Indexed_bt2/bowtie2' - if os.path.exists(f'{index_path}.1.bt2') is False: - raise FileNotFoundError(f'No Index files found under "{index_path}"') - # raises an Exception if the Index files cannot be found - log_path = f'{temp_path}bowtie2.log' - bed_logpath = f'{temp_path}bed.log' - # declares various filepaths - Error = False - if paired is True: - aligned_path = f'{temp_path}paired.bed' - cmd = f'{bowtie2_path} -x {index_path} -1 {fasta_file} \ - -2 {read2_file} --fast 2> {log_path} | \ - {samtools_path} view -b -q 30 -S -F 4 \ - | {bedtools_path} bamtobed -bedpe -i stdin > {aligned_path} \ - 2> {bed_logpath}' - # Should a backward read be found both files will be given to bowtie2. 
- # After converting .sam to .bam a conversion to .bed is done to - # properly mate the pairs - else: - aligned_path = f'{temp_path}unpaired.bam' - cmd = f'{bowtie2_path} -x {index_path} -q -U {fasta_file} \ - --fast 2> {log_path} | {samtools_path} view -b -q 30 -S -F 4 \ - -o {aligned_path}' - # Should no backward read be found it will just use the forward - # read and does an alignment followed by a pipe to convert - # the bowtie2 output .sam to a .bam file - os.system(cmd) + # Check for errors in the Bowtie2 log file with open(log_path, 'r') as log: - lines = log.readlines() - try: - int(lines[0].split()[0]) - except ValueError: + if any("error" in line.lower() for line in log): Error = True - # Checks if bowtie2 exited with an error - return aligned_path, Error + return aligned_path, Error # Return the path to the aligned file and error status From 99018bc14585541f0e26c266f7ea187392c55be6 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Tue, 4 Feb 2025 09:46:35 +0100 Subject: [PATCH 32/37] formatting of interact_bowtie2 --- vxdetector/interact_bowtie2.py | 55 +++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py index 2438cee..595e7c6 100644 --- a/vxdetector/interact_bowtie2.py +++ b/vxdetector/interact_bowtie2.py @@ -15,17 +15,21 @@ if bedtools_path is None: bedtools_path = '$CONDA/bin/bedtools' + def buildbowtie2(path): """Builds Bowtie2 index for the reference genome.""" - input_ref = f'{path}Indexed_bt2/85_otus.fasta' # Path to the reference genome + input_ref = f'{path}Indexed_bt2/85_otus.fasta' # Path to reference genome output_path = f'{path}Indexed_bt2/bowtie2' # Output path for the index # Check if the index already exists; if not, build it if not os.path.exists(f'{output_path}.1.bt2'): - os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null') + os.system( + f'{bowtie2_path}-build -f {input_ref}{output_path} > /dev/null') + -def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None): +def mapbowtie2( + fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None): """Maps reads against a previously built Bowtie2 index.""" - + index_path = f'{path}Indexed_bt2/bowtie2' # Path to the Bowtie2 index # Check if the index files exist if not os.path.exists(f'{index_path}.1.bt2'): @@ -46,35 +50,44 @@ def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=N if paired: aligned_path = f'{temp_path}paired.bed' # Output path for paired reads bowtie2_base_cmd += ['-1', fasta_file, '-2', read2_file] - + # Commands for Samtools and Bedtools - cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4'] - cmd_bedtools = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin'] + cmd_sam = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4'] + cmd_bed = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin'] # Run the Bowtie2, Samtools, and Bedtools pipeline - with open(log_path, 'w') as log_file, open(bed_logpath, 'w') as bed_log, open(aligned_path, 'w') as output_file: - bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE) - samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE) - subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file, stderr=bed_log) + with open(log_path, 'w') as log_file, \ + open(bed_logpath, 'w') as bed_log, \ + open(aligned_path, 'w') as output_file: + bowtie_process = 
From 99018bc14585541f0e26c266f7ea187392c55be6 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 09:46:35 +0100
Subject: [PATCH 32/37] formatting of interact_bowtie2

---
 vxdetector/interact_bowtie2.py | 55 +++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py
index 2438cee..595e7c6 100644
--- a/vxdetector/interact_bowtie2.py
+++ b/vxdetector/interact_bowtie2.py
@@ -15,17 +15,21 @@
 if bedtools_path is None:
     bedtools_path = '$CONDA/bin/bedtools'
 
+
 def buildbowtie2(path):
     """Builds Bowtie2 index for the reference genome."""
-    input_ref = f'{path}Indexed_bt2/85_otus.fasta'  # Path to the reference genome
+    input_ref = f'{path}Indexed_bt2/85_otus.fasta'  # Path to reference genome
     output_path = f'{path}Indexed_bt2/bowtie2'  # Output path for the index
     # Check if the index already exists; if not, build it
     if not os.path.exists(f'{output_path}.1.bt2'):
-        os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null')
+        os.system(
+            f'{bowtie2_path}-build -f {input_ref}{output_path} > /dev/null')
+
 
-def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None):
+def mapbowtie2(
+        fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None):
     """Maps reads against a previously built Bowtie2 index."""
-    
+
     index_path = f'{path}Indexed_bt2/bowtie2'  # Path to the Bowtie2 index
     # Check if the index files exist
     if not os.path.exists(f'{index_path}.1.bt2'):
@@ -46,35 +50,44 @@ def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=N
     if paired:
         aligned_path = f'{temp_path}paired.bed'  # Output path for paired reads
         bowtie2_base_cmd += ['-1', fasta_file, '-2', read2_file]
-        
+
         # Commands for Samtools and Bedtools
-        cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
-        cmd_bedtools = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin']
+        cmd_sam = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
+        cmd_bed = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin']
 
         # Run the Bowtie2, Samtools, and Bedtools pipeline
-        with open(log_path, 'w') as log_file, open(bed_logpath, 'w') as bed_log, open(aligned_path, 'w') as output_file:
-            bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
-            samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
-            subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file, stderr=bed_log)
+        with open(log_path, 'w') as log_file, \
+                open(bed_logpath, 'w') as bed_log, \
+                open(aligned_path, 'w') as output_file:
+            bowtie_process = subprocess.Popen(
+                bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
+            samtools_process = subprocess.Popen(
+                cmd_sam, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
+            subprocess.run(cmd_bed, stdin=samtools_process.stdout,
+                           stdout=output_file, stderr=bed_log)
 
     # Handle unpaired reads
     else:
-        aligned_path = f'{temp_path}unpaired.bed'  # Output path for unpaired reads
+        # Output path for unpaired reads
+        aligned_path = f'{temp_path}unpaired.bed'
         bowtie2_base_cmd += ['-U', fasta_file]
-        
+
        # Commands for Samtools and Bedtools
-        cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
-        cmd_bedtools = [bedtools_path, 'bamtobed', '-i', 'stdin']
+        cmd_sam = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
+        cmd_bed = [bedtools_path, 'bamtobed', '-i', 'stdin']
 
         # Run the Bowtie2 and Samtools pipeline
-        with open(log_path, 'w') as log_file, open(aligned_path, 'w') as output_file:
-            bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
-            samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
-            subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file)
+        with open(log_path, 'w') as log_f, open(aligned_path, 'w') as output_f:
+            bowtie_process = subprocess.Popen(
+                bowtie2_base_cmd, stderr=log_f, stdout=subprocess.PIPE)
+            samtools_process = subprocess.Popen(
+                cmd_sam, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
+            subprocess.run(
+                cmd_bed, stdin=samtools_process.stdout, stdout=output_f)
 
     # Check for errors in the Bowtie2 log file
     with open(log_path, 'r') as log:
         if any("error" in line.lower() for line in log):
             Error = True
-    
-    return aligned_path, Error  # Return the path to the aligned file and error status
+    # Return the path to the aligned file and error status
+    return aligned_path, Error
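Reviewer note: patch 32's central pattern is the three subprocess stages connected through pipes, replacing a shell pipeline built as one f-string, so no shell ever parses the file paths. A self-contained sketch of the same pattern with hypothetical paths; the explicit .stdout.close() calls and the final wait() are a common hardening step that the patch itself does not add:

    import subprocess

    with open('bowtie2.log', 'w') as log, open('unpaired.bed', 'w') as out:
        bowtie = subprocess.Popen(
            ['bowtie2', '-x', 'Indexed_bt2/bowtie2', '-q',
             '-U', 'reads.fastq', '--fast'],
            stdout=subprocess.PIPE, stderr=log)
        samtools = subprocess.Popen(
            ['samtools', 'view', '-b', '-q', '30', '-S', '-F', '4'],
            stdin=bowtie.stdout, stdout=subprocess.PIPE)
        bowtie.stdout.close()   # let bowtie2 get SIGPIPE if samtools exits early
        subprocess.run(['bedtools', 'bamtobed', '-i', 'stdin'],
                       stdin=samtools.stdout, stdout=out)
        samtools.stdout.close()
        bowtie.wait()           # reap bowtie2 and collect its exit status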
From cc4fadc681c010580abccf5e7c77ccfbb3c7a524 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 11:35:56 +0100
Subject: [PATCH 33/37] fixes for testing

---
 vxdetector/interact_bowtie2.py           |  9 +++++----
 vxdetector/tests/test_interact_bowtie.py | 20 +++++++++++++-------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py
index 595e7c6..0bd10a8 100644
--- a/vxdetector/interact_bowtie2.py
+++ b/vxdetector/interact_bowtie2.py
@@ -23,7 +23,7 @@ def buildbowtie2(path):
     # Check if the index already exists; if not, build it
     if not os.path.exists(f'{output_path}.1.bt2'):
         os.system(
-            f'{bowtie2_path}-build -f {input_ref}{output_path} > /dev/null')
+            f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null')
 
 
 def mapbowtie2(
@@ -77,13 +77,14 @@ def mapbowtie2(
         cmd_bed = [bedtools_path, 'bamtobed', '-i', 'stdin']
 
         # Run the Bowtie2 and Samtools pipeline
-        with open(log_path, 'w') as log_f, open(aligned_path, 'w') as output_f:
+        with open(log_path, 'w') as log_file, \
+                open(aligned_path, 'w') as output_file:
             bowtie_process = subprocess.Popen(
-                bowtie2_base_cmd, stderr=log_f, stdout=subprocess.PIPE)
+                bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
             samtools_process = subprocess.Popen(
                 cmd_sam, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
             subprocess.run(
-                cmd_bed, stdin=samtools_process.stdout, stdout=output_f)
+                cmd_bed, stdin=samtools_process.stdout, stdout=output_file)
 
     # Check for errors in the Bowtie2 log file
     with open(log_path, 'r') as log:
diff --git a/vxdetector/tests/test_interact_bowtie.py b/vxdetector/tests/test_interact_bowtie.py
index 860d1f0..429ace7 100644
--- a/vxdetector/tests/test_interact_bowtie.py
+++ b/vxdetector/tests/test_interact_bowtie.py
@@ -31,12 +31,14 @@ def test_aligned_path(self):
         paired = False
         temp_path = self.fp_tmpdir
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                                             path, temp_path, paired)
-        self.assertEqual(aligned_path, f'{temp_path}unpaired.bam')
+                                             path, temp_path, paired,
+                                             bowtie2_params="")
+        self.assertEqual(aligned_path, f'{temp_path}unpaired.bed')
         self.assertEqual(Error, False)
         paired = True
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                                             path, temp_path, paired)
+                                             path, temp_path, paired,
+                                             bowtie2_params="")
         self.assertEqual(aligned_path, f'{temp_path}paired.bed')
         self.assertEqual(Error, False)
 
@@ -52,7 +54,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         os.system(f'{samtools_path} view {temp_path}unpaired.bam '
                   f'> {temp_path}unpaired.sam')
         output = []
@@ -66,7 +69,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         output = []
         with open(f'{temp_path}paired.bed')as f:
             for line in f:
@@ -78,12 +82,14 @@ def test_wrong_file_type(self):
         read2_file = f'{path}test_data/paired/BED.bed'
         paired = True
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, read2_file,
-                                             path, self.fp_tmpdir, paired)
+                                             path, self.fp_tmpdir, paired,
+                                             bowtie2_params="")
         self.assertEqual(Error, True)
         paired = False
         fasta_file_local = f'{path}test_data/paired/BED.bed'
         aligned_path, Error = ibo.mapbowtie2(fasta_file_local, read2_file,
-                                             path, self.fp_tmpdir, paired)
+                                             path, self.fp_tmpdir, paired,
+                                             bowtie2_params="")
         self.assertEqual(Error, True)
 
     def test_no_index(self):
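Reviewer note: the one-character fix at the top of patch 33 (restoring the space between {input_ref} and {output_path} that patch 32's line wrap dropped) is the classic hazard of assembling shell commands from f-strings. Passing an argument list to subprocess makes that whole bug class impossible; a hedged sketch of the alternative, not what the module currently does (it keeps os.system here):

    import os
    import subprocess

    def build_index(path, bowtie2_build='bowtie2-build'):
        '''Build the Bowtie2 index unless it already exists.'''
        input_ref = os.path.join(path, 'Indexed_bt2', '85_otus.fasta')
        output_path = os.path.join(path, 'Indexed_bt2', 'bowtie2')
        if not os.path.exists(f'{output_path}.1.bt2'):
            # Every argument is its own list element; there is no string
            # concatenation where a separator could silently go missing.
            subprocess.run([bowtie2_build, '-f', input_ref, output_path],
                           stdout=subprocess.DEVNULL, check=True)

The rest of the patch threads bowtie2_params="" through the test calls to match the new mapbowtie2 signature and updates the expected unpaired output from .bam to .bed.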
From b2743c3f7e1816a4ca8173d320cc56f475bbe7c9 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 11:41:51 +0100
Subject: [PATCH 34/37] reduced sample size for the test

---
 vxdetector/tests/test_VXdetector.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index 547e5d8..ba17a58 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -151,7 +151,7 @@ def test_singleFile(self):
         expected = f'{self.path}test_data/Output_test.csv'
         actual = f'{self.fp_tmpdir}/singleFile_test.csv'
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, False, 1000, None)
+        vx.workflow(test_file, actual, False, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -166,7 +166,7 @@ def test_directory(self):
         expected = f'{self.path}test_data/dir_test.csv'
         actual = f'{self.path}/test_data/dir_test_actual.csv'
         test_file = f'{self.path}test_data/test_dir/'
-        vx.workflow(test_file, actual, False, 1000, None)
+        vx.workflow(test_file, actual, False, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -181,7 +181,7 @@ def test_c_option(self):
         expected = f'{self.path}test_data/Output_test.csv'
         actual = sys.stdout
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, False, 1000, None)
+        vx.workflow(test_file, actual, False, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -195,21 +195,21 @@
     def test_no_fastq(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout,
-                        False, 1000, None)
+                        False, 100, None)
         self.assertEqual('There were no FASTQ files in this directory',
                          str(cm.exception))
 
     def test_Error(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/Output_test.csv',
-                        sys.stdout, False, 1000, None)
+                        sys.stdout, False, 100, None)
         self.assertEqual('This file does not look like a fastq file',
                          str(cm.exception))
 
     def test_raise(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_test.fastq',
-                        sys.stdout, False, 1000, None)
+                        sys.stdout, False, 100, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))
         with self.assertRaises(ValueError) as cm:

From 545c8c2df09dd4a2cae4e48639a4766ee7976854 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 12:39:09 +0100
Subject: [PATCH 35/37] changed a parameter

---
 vxdetector/tests/test_VXdetector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index ba17a58..2c9c72b 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -214,6 +214,6 @@ def test_raise(self):
             'mapping-quality', str(cm.exception))
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_'
-                        'paired_R1_001.fastq', sys.stdout, False, 1000, None)
+                        'paired_R1_001.fastq', sys.stdout, False, 100, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))

From e2d5f2ba31a0cecbda8765f9092473a81b41afba Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 13:16:38 +0100
Subject: [PATCH 36/37] Changes to test_interact_bowtie

---
 vxdetector/tests/test_interact_bowtie.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vxdetector/tests/test_interact_bowtie.py b/vxdetector/tests/test_interact_bowtie.py
index 429ace7..2256178 100644
--- a/vxdetector/tests/test_interact_bowtie.py
+++ b/vxdetector/tests/test_interact_bowtie.py
@@ -54,8 +54,7 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired,
-                       bowtie2_params="")
+                       path, temp_path, paired)
         os.system(f'{samtools_path} view {temp_path}unpaired.bam '
                   f'> {temp_path}unpaired.sam')
         output = []
@@ -69,8 +68,7 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired,
-                       bowtie2_params="")
+                       path, temp_path, paired)
         output = []
         with open(f'{temp_path}paired.bed')as f:
             for line in f:
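Reviewer note: patches 34 and 35 reduce the subsample size passed to vx.workflow from 1000 to 100 reads across the test suite (patch 35 catches the one call patch 34 missed), and patch 36 strips the explicit bowtie2_params argument from test_pipe. The churn in these call sites suggests centralising the repeated call; a hypothetical helper for the test module, assuming the vx import the tests already use:

    import sys

    SAMPLE_SIZE = 100  # subsample size shared by every workflow test

    def run_workflow(test_file, output=sys.stdout, write_csv=False):
        # vx is the VXdetector module the tests already import; the
        # positional order mirrors the calls in the hunks above.
        return vx.workflow(test_file, output, write_csv, SAMPLE_SIZE, None)

With such a helper, a future sample-size or signature change touches one line instead of six call sites.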
From 7f6cfc8f61ab917bab2c24a029ff22cee4009aa2 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 13:22:32 +0100
Subject: [PATCH 37/37] undo changes

---
 vxdetector/tests/test_interact_bowtie.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vxdetector/tests/test_interact_bowtie.py b/vxdetector/tests/test_interact_bowtie.py
index 2256178..429ace7 100644
--- a/vxdetector/tests/test_interact_bowtie.py
+++ b/vxdetector/tests/test_interact_bowtie.py
@@ -54,7 +54,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         os.system(f'{samtools_path} view {temp_path}unpaired.bam '
                   f'> {temp_path}unpaired.sam')
         output = []
@@ -68,7 +69,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         output = []
         with open(f'{temp_path}paired.bed')as f:
             for line in f:
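Reviewer note: patch 37 restores the bowtie2_params="" arguments that patch 36 had just removed from test_pipe, returning the tests to the state patch 33 established. Whether the empty string and the declared default of None are interchangeable depends on how mapbowtie2 consumes the parameter, which these hunks do not show; spelling the argument as a keyword at least makes each call site self-documenting, e.g.:

    # Keyword form of the same call: it is immediately visible that no
    # extra Bowtie2 flags are requested for this test run.
    ibo.mapbowtie2(self.fasta_file, self.read2_file,
                   path, temp_path, paired,
                   bowtie2_params="")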