From d38ae75264c14d534c95e7df14fd63d713abccb1 Mon Sep 17 00:00:00 2001 From: Jeremy Meiss Date: Mon, 5 Aug 2024 10:35:57 +0200 Subject: [PATCH 01/37] add subsampling --- vxdetector/VXdetector.py | 232 +++++++++++---------------------------- 1 file changed, 64 insertions(+), 168 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index f69dcf7..bee5e52 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -4,246 +4,142 @@ import glob import os import sys +import warnings import pandas as pd -import vxdetector.Output_counter as Output_counter -import vxdetector.files_manager as files_manager -from vxdetector.interact_bowtie2 import mapbowtie2, buildbowtie2 -from vxdetector.interact_bedtools import overlap - +import Output_counter as Output_counter +import files_manager as files_manager +from interact_bowtie2 import mapbowtie2, buildbowtie2 +from interact_bedtools import overlap +import time +from multiprocessing import pool +import random + +def sample_fastq(file_path, sample_size): + '''get random parts from the fq file''' + sampled_reads = [] + with open(file_path, 'r') as f: + lines = f.readlines() + total_reads = len(lines) // 4 + sampled_indices = sorted(random.sample(range(total_reads), sample_size)) + for idx in sampled_indices: + read = lines[idx*4:(idx+1)*4] + sampled_reads.extend(read) + return sampled_reads + +def save_sampled_fastq(sampled_reads, output_path): + '''Save sampled reads to a temporary FASTQ file''' + with open(output_path, 'w') as f: + f.writelines(sampled_reads) def do_statistic(result): - r'''Statistical analysis of given directory - - This function calculates the average (mean) value and - the standard deviation of all dataFrame collumns containg - numeric data. - It also declares which variable region(s) is/are the most - probable sequenced region over all fastq files. - - Parameters - ---------- - result : pandas dataFrame - DataFrame with filenames as indey and the obtained information - in the columns. Each file is a single row. - - Returns - ------- - result : pandas dataFrame - DataFrame is the same as above. - The first two lines were added which show the average (mean) and - standard deviation of each column containg numeric data. 
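A minimal usage sketch of the two helpers this patch introduces; the file names and sample size are placeholders, not taken from the repository. Note that readlines() pulls the whole FASTQ into memory, a limitation the later memory-efficient rewrite addresses.

```python
# Hypothetical usage of the patch-01 helpers; paths are placeholders.
sampled = sample_fastq('example_R1.fastq', sample_size=100)
save_sampled_fastq(sampled, '/tmp/sampled.fastq')
assert len(sampled) == 100 * 4  # four FASTQ lines per sampled read
```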
- Index of these rows is ['Average', 'Standard deviation'] - Columns are: - ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region'] - ''' average = result.mean(numeric_only=True).to_frame().T - # calculates mean value for every numeric column region = (result['Sequenced variable region'].mode().values) - # finds the most common value in column 'Sequenced variable region' region = ' / '.join(str(r) for r in region) region = region.replace('\'', '') region = region.replace('[', '') region = region.replace(']', '') - # formats the mode() output for easy viewing average['Sequenced variable region'] = region if 'Not properly paired' not in average.columns: average['Not properly paired'] = 'not paired' - # adds 'Not properly paired' column if reads are unpaired std_dev = result.std(numeric_only=True).to_frame().T - # calculates standard deviation for every numeric column statistic = pd.concat([average, std_dev], axis=0) - statistic = statistic[['Number of Reads', - 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', - 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', - 'Not aligned to a variable region']] - # combines the average and std_dev dataframes and sorts the columns - # in the correct order + statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] statistic['row_descriptor'] = ['Average', 'Standard deviation'] statistic = statistic.set_index('row_descriptor') - # adds a descriptive index result = pd.concat([statistic, result], axis=0) return result - def do_output(result, new_file, single_file): - r'''Writes Output - - This function writes the obtained information in a csv format. - Parameters - --------- - result : dict - Dictionary with filenames as keys and the obtained information - as a Dictionary stored in the value. - new_file : str or _io.TextIOWrapper - Filepath to the output file. - If none is given to the program (_io.TextIOWrapper) output - will go to STDOUT. - single_file : Bool - Signifies wether a directory or a single file was given via terminal - input. - - ''' + warnings.simplefilter(action='ignore', category=FutureWarning) result = pd.DataFrame(result).T.sort_index() - # converts dict to dataframe and sorts the dataframe by index - # allows for easy navigation within the file for column in result: result[column] = pd.to_numeric(result[column], errors='ignore') - # converts all cell values to a numeric data type if applicable if single_file is False: result = do_statistic(result) else: - result = result[['Number of Reads', - 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', - 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', - 'Not aligned to a variable region']] + result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] result.to_csv(new_file, index=True) - -def workflow(file_dir, new_file, write_csv): - r'''Worker function - - This function is the center piece of this program. - It does some preliminary work such as grabbing all fastq files - in the given directory. - After that it calls other functions which analyse the found - fastq-files. 
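The mode-formatting step in do_statistic() is easiest to see in isolation; a small self-contained sketch with made-up region labels:

```python
import pandas as pd

# Series.mode() returns all most-frequent values; the join collapses
# ties into one display string, mirroring do_statistic() above.
regions = pd.Series(['V3V4', 'V3V4', 'V4'])
label = ' / '.join(str(r) for r in regions.mode().values)
print(label)  # -> V3V4
```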
- - Parameters - ---------- - file_dir : str - Given filepath. Can be a path to single file which leads - to visual output whichcan easily be understood or a - path to a directory which produces csv output. - new_file : str - Filepath to the Output file. If the flag is not set - output will be printed via STDOUT. - write_csv : Bool - Wether or not a csv file should be written in the - standard Output folder of this program. - - ''' +def workflow(file_dir, new_file, write_csv, sample_size): + path = files_manager.get_lib() temp_path = files_manager.tmp_dir(path, temp_path=None) - # sets the paths of the programm itself and a temporary folder paired = False result = dict() buildbowtie2(path) - # builds bowtie2 index - if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] \ - and os.path.isdir(file_dir): + if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] and os.path.isdir(file_dir): files_manager.tmp_dir(path, temp_path) - raise ValueError('There were no FASTQ files ' - 'in this directory') - # checks if given directory contains fastq files + raise ValueError('There were no FASTQ files in this directory') if os.path.isfile(file_dir): single_file = True file_name = os.path.basename(file_dir) - read2_file = os.path.join(os.path.dirname(file_dir), - file_name.replace('_R1_', '_R2_')) + read2_file = os.path.join(os.path.dirname(file_dir), file_name.replace('_R1_', '_R2_')) if '_R1_' in file_name and os.path.exists(read2_file): paired = True - # searches for a reverse read file - aligned_path, Error = mapbowtie2(file_dir, read2_file, - path, temp_path, paired) - # The Programm bowtie2 is used to align the Reads to a reference - # 16S database. + sampled_reads = sample_fastq(file_dir, sample_size) + temp_fastq = os.path.join(temp_path, 'sampled.fastq') + save_sampled_fastq(sampled_reads, temp_fastq) + aligned_path, Error = mapbowtie2(temp_fastq, read2_file, path, temp_path, paired) if Error is True: files_manager.tmp_dir(path, temp_path) raise ValueError('This file does not look like a fastq file') - # raises error if given file is not a fastq file - if paired is True and Output_counter.rawincount(f'{temp_path}' - 'paired.bed') == 0: + if paired is True and Output_counter.rawincount(f'{temp_path}paired.bed') == 0: files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required ' - 'mapping-quality') + raise ValueError('This file has no Reads of the required mapping-quality') overlap(path, temp_path, aligned_path) - # look which reads intersect with which variable Region - if paired is False and Output_counter.rawincount(f'{temp_path}' - 'BED.bed') == 0: + if paired is False and Output_counter.rawincount(f'{temp_path}BED.bed') == 0: files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required ' - 'mapping-quality') + raise ValueError('This file has no Reads of the required mapping-quality') file_name = file_name.rsplit('.f', 1)[0] file_name = file_name.replace('_R1_001', '') - # reformats filename for easy viewing result[file_name] = Output_counter.create_row(temp_path, paired) - # streamlines generated output to a visual terminal output elif os.path.isdir(file_dir): single_file = False + total_files = len(glob.glob(f'{file_dir}**/*.fastq*', recursive=True)) + processed_files = 0 for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True): + processed_files += 1 paired = False if '_R2_' in fq_file: continue file_name = os.path.basename(fq_file) - read2_file = os.path.join(os.path.dirname(fq_file), - 
file_name.replace('_R1_', '_R2_')) + read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) if '_R1_' in file_name and os.path.exists(read2_file): paired = True - # searches for a reverse read file - aligned_path, Error = mapbowtie2(fq_file, read2_file, - path, temp_path, paired) - # The Programm bowtie2 is used to align the Reads to a reference - # 16S database. + sampled_reads = sample_fastq(fq_file, sample_size) + temp_fastq = os.path.join(temp_path, 'sampled.fastq') + save_sampled_fastq(sampled_reads, temp_fastq) + aligned_path, Error = mapbowtie2(temp_fastq, read2_file, path, temp_path, paired) if Error is True: continue overlap(path, temp_path, aligned_path) - # look which reads intersect with which variable Region file_name = file_name.rsplit('.f', 1)[0] file_name = file_name.replace('_R1_001', '') - # reformats filename for easy viewing - result[file_name] = Output_counter.create_row(temp_path, - paired) - # streamlines generated output to a visual terminal output + result[file_name] = Output_counter.create_row(temp_path, paired) files_manager.tmp_dir(path, temp_path) - # deletes temporary folder do_output(result, new_file, single_file) - # writes ouput eiher to STDOUT or to a file specified - # via the -o option if write_csv is True: - new_file = (f'{path}Output/' - f'{os.path.basename(os.path.dirname(file_dir))}.csv') + new_file = (f'{path}Output/{os.path.basename(os.path.dirname(file_dir))}.csv') do_output(result, new_file, single_file) - # writes csv file to the standard output folder - def main(): - r'''Main function - - This function uses argparse to interpret user - input and then calls the workflow function which - does the actual work. - - Iput otions - -o, --output : - User specified output path - -c, --csv : - If set a csv file will be written into - the standard Output folder - - ''' parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This programm tries to find which variable region of the 16S ' - 'sequence was sequencend')) - parser.add_argument('dir_path', - help=('Directory path of the directory containing ' - 'multiple fastq or fasta files.')) - parser.add_argument('-o', '--output', dest='output_file', - default=sys.stdout, - help='User can specify a file format in which the \ - output is written in the Output folder.') - parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', - help='If set the output will be written in a \ - .csv file in the Output folder') + 'This programm tries to find which variable region of the 16S sequence was sequencend')) + parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) + parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, + help='User can specify a file format in which the output is written in the Output folder.') + parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', + help='If set the output will be written in a .csv file in the Output folder') + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, + help='Number of reads to sample from each FASTQ file') args = parser.parse_args() - # allows terminal input - workflow(args.dir_path, args.output_file, args.write_csv) - + workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) if __name__ == '__main__': main() From 4684232aaab4cee66a698740c91458888a49cf27 Mon Sep 17 00:00:00 2001 From: Jeremy Meiss Date: Mon, 5 Aug 2024 10:52:50 +0200 Subject: [PATCH 
02/37] add subsampling --- vxdetector/VXdetector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index bee5e52..22f41f7 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -10,7 +10,6 @@ import files_manager as files_manager from interact_bowtie2 import mapbowtie2, buildbowtie2 from interact_bedtools import overlap -import time from multiprocessing import pool import random From d3e1cafe80b6e119b63a6d9ab97c9eed0c1edbf3 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Mon, 5 Aug 2024 11:01:54 +0200 Subject: [PATCH 03/37] add subsampling --- .DS_Store | Bin 0 -> 6148 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..44be779b37ebb780b7d4c19577818362dacc2ceb GIT binary patch literal 6148 zcmeHKJxc>Y5Ph2u3<*N8vb@etjE(IXBA8PA0m>P_z<5CtQLwn5U}@!#@E3@Py<%%8 z_#`2_4hDM!AeK06jo0!^Aw7}D*kk698JcmeMAvHk5yQB4jz_*O_LwSgKcN`=bj}~ya+uhmMpZx+7+0X^ZtJrDZ{(l<$D8y{6;K8Kl>(;J+HN)ZN`7yh xcsbc?Bl<2~OzJX+wuQlO$M! Date: Mon, 5 Aug 2024 11:06:00 +0200 Subject: [PATCH 04/37] "add subsampling" --- .DS_Store | Bin 6148 -> 6148 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/.DS_Store b/.DS_Store index 44be779b37ebb780b7d4c19577818362dacc2ceb..5adcdaf9c4ac643ccd6532e224019fbae50e472d 100644 GIT binary patch delta 106 zcmZoMXffEJ#uD4Gi-CcGg+Y%YogtHwcs<&ES|Ls(cDw`GO3? V;N<+=0-zoS)*}ZtH?uSf0|3rs95es` delta 106 zcmZoMXffEJ#u8gOgMop8g+Y%YogtH Date: Mon, 5 Aug 2024 11:12:27 +0200 Subject: [PATCH 05/37] add subsampling --- .DS_Store | Bin 6148 -> 6148 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/.DS_Store b/.DS_Store index 5adcdaf9c4ac643ccd6532e224019fbae50e472d..f8561586ee19eeb6420ee78f8731c0fcfb2a330d 100644 GIT binary patch delta 106 zcmZoMXffEJ#u7UxgMop8g+Y%YogtHS16#zF>1ROQSFVG`Sgl delta 106 zcmZoMXffEJ#uD4Gi-CcGg+Y%YogtHwcs<&ES|Ls(cDw`GO3? 
V;N<+=0-zoS)*}ZtH?uSf0|3rs95es` From 8d8438e481d1f785e326fd4681775d9347521919 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Mon, 19 Aug 2024 09:59:03 +0200 Subject: [PATCH 06/37] update --- .DS_Store | Bin 6148 -> 8196 bytes vxdetector/.DS_Store | Bin 0 -> 6148 bytes vxdetector/testlog | 2 ++ vxdetector/tests/.DS_Store | Bin 0 -> 6148 bytes 4 files changed, 2 insertions(+) create mode 100644 vxdetector/.DS_Store create mode 100644 vxdetector/testlog create mode 100644 vxdetector/tests/.DS_Store diff --git a/.DS_Store b/.DS_Store index f8561586ee19eeb6420ee78f8731c0fcfb2a330d..5007d88cbd18a60f323aef667935563dabdf85ad 100644 GIT binary patch literal 8196 zcmeHM&ubGw6n>i~?ZzU8qR@-7An2vkG_955CB_umg9wf2K_zChF$vx5uE{3p4+}Z# zKj6uWS3wXD9>s%RRm79t1pfuagMRZP?PilSJ&C0=F!SEdeDCdh-|Wn0HbkUln${fA zG!f~jOeQAL3@B`z)xHvuJ!c>_*ppqJ%N5<4*N0RciUGxdVn8vV7*GuSFAU(F&5Azc zy)Q;ps~AuW97zV){$QXonU=C6B;PvF@FM_Z6y3Z)PcQ}2kTNZ0M@WpIXu=dym}>Hf zp$T*J8xoh6vLmE0CrutcGjag^p%$2^lMAiJuYE+h3Ef_`-J)cGZ89wT*+0BM zBbuaD*jTXHK(#1OCi#>KS-m)zRgjCw$8S%E`wwz*{>j5GCqtqMax)qoCl`9MC3cmhh4q%GXRr*^Lop6T zXHK3fB09NEbn?d38H|hBIXDEFL5hGb;|3D0AlGe7{LVa?U&a&URt6@ByFp%I*c{I@ GhZz9w!z{W0 diff --git a/vxdetector/.DS_Store b/vxdetector/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3980255f502e9eb1aa9391e620c1023c74d2632a GIT binary patch literal 6148 zcmeHK&x_MQ6n@j~+SIa8afQ7I0k37PUD-vvbnANXBp6u*m6~j74Q4Z?NexRW!z#8|(HY=dz>r!}jY z7Ul^RBchCkxb|tX=4~6S0#9C5c2=k~@p zl5)g}5U;l}%GXMiTlAb}U`$j##CRl1%kga*$FE9+2H31chLKAeuc{~be&0&!ZU6O7 zS3PUvO5BoXkf#&#M1BO5I7+i__ot|B)wgdnoQBhK-gzHo?qz;9O9%e+4cA^u83)U< zADl*`eAwQ7B9qLIlF?KVqHqM4x38ikl=(o;k}y?VPj@&?r#Woz&F8&?qmJ71unJs%1$cdMkr?|13yo^&K&FlWz&e_h zq0B!E%yA9&4Hg>F0~4AG)Kp=P7(&z0?i#FIm z?i7k6pqN4$Q+SxIjJ6F{0jt3OrU1XY9lWs#l_+cPuRM-28KIZAhNtkIU&;_QW<=9+ zWHP=-qbx1Q{r(TpT5DgszV56$8_s+GlbrhFa6Bo8Veyu0ucXYPMI1&)X)+DEx1Y&; z9Hx0vsDw05Fy!6qG>_$UC?|Pbs$5@pI31@Gbhl@-!R~&~9qjGTd+zM<|pXo%*~owA1S3wX&2BLcD@{b zf_gzaG@{Rk*^-LikX={tZjC8LEnLyufv<#&tw*u(Cw8@LFZFUM*I Date: Thu, 12 Sep 2024 16:42:49 +0200 Subject: [PATCH 07/37] updated sample size, now working --- .DS_Store | Bin 8196 -> 8196 bytes vxdetector/.DS_Store | Bin 6148 -> 6148 bytes vxdetector/VXdetector.py | 67 +++++++++++++++++++++++++++++-------- vxdetector/testlog | 11 ++++-- vxdetector/tests/.DS_Store | Bin 6148 -> 6148 bytes 5 files changed, 62 insertions(+), 16 deletions(-) diff --git a/.DS_Store b/.DS_Store index 5007d88cbd18a60f323aef667935563dabdf85ad..6b56441b7de3097435e94e155910f8b2d2e61d93 100644 GIT binary patch delta 939 zcmZp1XmOa}&nUeyU^hRb^kyD`X^fLsi8QjC8R#e&n^;aZ61AWFSWtZOBEgxHLj}co z%7TmXa`N-i85kHCCtC5k^K2CKu+*YrARU=l zRFs&Pp6XN?o0FK7n&Y46lwVSkpBpd057rBGWk69XNFUhT;QE5Z0QrJ)p~!{$8EQpU~f l65m+r2O^4)VyJ!~q6jI5>Oo;b?H+`AG~63<3-cOzA*c?>`s-Squ!=%}8d*2byNUPzAK1 zgrOYO3`SX?zBT{<*Mm&KV@L`^AXnIR8InlYp^6oAYSKm;p;Bgi3> zzX>adpqkFG9xAbr-NZmg!PwYr@;s4{&8x&>88;TFFm7g-_{K6hQ$!DotRf=-TOfqF diff --git a/vxdetector/.DS_Store b/vxdetector/.DS_Store index 3980255f502e9eb1aa9391e620c1023c74d2632a..b731e8a11f420afc3e722ba25df51d9e5f08f722 100644 GIT binary patch delta 117 zcmZoMXfc@J&&atkU^g=(=VTrhHG%*C|LZd_FjxU;FqrJbV#LN63>4~`T*czd3S!-w ze2B#n#A4K%%)x34Vlib+u3%MRV(Fedk99V?nSqXiv5Dnm2evj2rd<$&HWsEcZf58B G%MSobZz$;i delta 163 zcmZoMXfc@J&&ahgU^g=(*JK_RH30?&21g*)`ws>T43nK$jMx~~gM=nmu{g8Jf?0=H z99g@;EDly% + main() + File "/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 141, in main + workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) + File 
"/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 77, in workflow + raise ValueError('There were no FASTQ files in this directory') +ValueError: There were no FASTQ files in this directory diff --git a/vxdetector/tests/.DS_Store b/vxdetector/tests/.DS_Store index b1db8aa7b125485b75cc40b7b9725366f50d3750..00cda90bc2bc6b36c05e292e0508f64d6be0bf8d 100644 GIT binary patch delta 20 bcmZoMXffDum4)5RKu5vY#B%c;77bwlMWO~7 delta 20 bcmZoMXffDum4)5dTt~sk%zX1577bwlMa2dj From 561156990ec5cde2c812a3217845306ecdebb49e Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Fri, 27 Sep 2024 12:56:45 +0200 Subject: [PATCH 08/37] added comments to the skript, removed the debugg- code. made an extra file for the sampler version --- vxdetector/VXdetector_sampler.py | 198 +++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 vxdetector/VXdetector_sampler.py diff --git a/vxdetector/VXdetector_sampler.py b/vxdetector/VXdetector_sampler.py new file mode 100644 index 0000000..2af3313 --- /dev/null +++ b/vxdetector/VXdetector_sampler.py @@ -0,0 +1,198 @@ +#!/usr/bin/python + +import argparse +import glob +import os +import sys +import warnings +import pandas as pd +import Output_counter as Output_counter +import files_manager as files_manager +from interact_bowtie2 import mapbowtie2, buildbowtie2 +from interact_bedtools import overlap +import random + +def sample_fastq(file_path, sample_size, sampled_indices=None): + '''Get random parts from the FASTQ file based on shared indices for paired-end reads.''' + sampled_reads = [] # List to hold sampled reads + with open(file_path, 'r') as f: + lines = f.readlines() + total_reads = len(lines) // 4 # FASTQ files store 4 lines per read + if sampled_indices is None: + # If no indices are provided, sample randomly from the total reads + sampled_indices = sorted(random.sample(range(total_reads), sample_size)) + for idx in sampled_indices: + # Get the read (4 lines per read) and append to the sampled_reads list + read = lines[idx*4:(idx+1)*4] + sampled_reads.extend(read) + return sampled_reads, sampled_indices + +def save_sampled_fastq(sampled_reads, output_path): + '''Save sampled reads to a temporary FASTQ file.''' + with open(output_path, 'w') as f: + f.writelines(sampled_reads) + +def do_statistic(result): + '''Compute statistics (mean and standard deviation) from alignment results.''' + average = result.mean(numeric_only=True).to_frame().T # Compute the mean of numeric columns + region = (result['Sequenced variable region'].mode().values) # Get most frequent region + region = ' / '.join(str(r) for r in region) # Join the modes into a single string + region = region.replace('\'', '').replace('[', '').replace(']', '') # Clean up string format + average['Sequenced variable region'] = region # Add region info to the average dataframe + if 'Not properly paired' not in average.columns: + average['Not properly paired'] = 'not paired' + std_dev = result.std(numeric_only=True).to_frame().T # Compute standard deviation of numeric columns + statistic = pd.concat([average, std_dev], axis=0) # Combine mean and standard deviation + # Select specific columns for the final output + statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] + statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Add row descriptors + statistic = statistic.set_index('row_descriptor') # Set 
descriptor as index + result = pd.concat([statistic, result], axis=0) # Append stats to original results + return result + +def do_output(result, new_file, single_file): + '''Process and output results to a CSV file.''' + warnings.simplefilter(action='ignore', category=FutureWarning) + result = pd.DataFrame(result).T.sort_index() # Convert result to DataFrame and sort + + # Convert columns to numeric where applicable + for column in result: + result[column] = pd.to_numeric(result[column], errors='ignore') + + if single_file is False: + result = do_statistic(result) # Add statistics if processing multiple files + else: + result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] + + # Write the result DataFrame to a CSV file + result.to_csv(new_file, index=True) + +def workflow(file_dir, new_file, write_csv, sample_size): + '''Main workflow for processing FASTQ files and performing sequence alignment.''' + path = files_manager.get_lib() # Get the main working directory + temp_path = files_manager.tmp_dir(path, temp_path=None) # Create a temporary directory + paired = False # To check if the input is paired-end + single_file = False # To check if processing a single file + result = dict() # Dictionary to store results from different files + buildbowtie2(path) # Build the Bowtie2 index + + # Check if the input path exists + if not os.path.exists(file_dir): + raise ValueError(f"The provided path {file_dir} does not exist.") + + # Check if there are FASTQ files in the directory + if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] and os.path.isdir(file_dir): + files_manager.tmp_dir(path, temp_path) + raise ValueError('There were no FASTQ files in this directory') + + # Process single file + if os.path.isfile(file_dir): + single_file = True + file_name = os.path.basename(file_dir) + read2_file = os.path.join(os.path.dirname(file_dir), file_name.replace('_R1_', '_R2_')) # Get the paired file if exists + if '_R1_' in file_name and os.path.exists(read2_file): + paired = True # Mark as paired if both R1 and R2 files exist + + # Sample reads from R1 and R2 with same indices + sampled_reads_R1, sampled_indices = sample_fastq(file_dir, sample_size) + sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) + + # Save sampled reads to temporary files + temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') + temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') + save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) + save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) + + # Run Bowtie2 alignment + aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) + + if Error is True: + files_manager.tmp_dir(path, temp_path) + raise ValueError('This file does not look like a fastq file') + if paired is True and Output_counter.rawincount(f'{temp_path}paired.bed') == 0: + files_manager.tmp_dir(path, temp_path) + raise ValueError('This file has no Reads of the required mapping-quality') + + # Perform overlap analysis using Bedtools + overlap(path, temp_path, aligned_path) + if paired is False and Output_counter.rawincount(f'{temp_path}BED.bed') == 0: + files_manager.tmp_dir(path, temp_path) + raise ValueError('This file has no Reads of the required mapping-quality') + + # Generate results from output + file_name = file_name.rsplit('.f', 1)[0] + file_name = file_name.replace('_R1_001', '') + 
result[file_name] = Output_counter.create_row(temp_path, paired) + + # Process directory of files + elif os.path.isdir(file_dir): + single_file = False + total_files = len(glob.glob(f'{file_dir}**/*.fastq*', recursive=True)) # Count number of FASTQ files + processed_files = 0 + for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True): + processed_files += 1 + paired = False + if '_R2_' in fq_file: + continue # Skip R2 files to avoid processing duplicates + + file_name = os.path.basename(fq_file) + read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) + if '_R1_' in file_name and os.path.exists(read2_file): + paired = True # Mark as paired if both R1 and R2 files exist + + # Sample reads from R1 and R2 with same indices + sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) + sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) + + # Save sampled reads to temporary files + temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') + temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') + save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) + save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) + + # Run Bowtie2 alignment + aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) + if Error is True: + continue # Skip if there's an error in processing the file + + # Perform overlap analysis using Bedtools + overlap(path, temp_path, aligned_path) + + # Generate results from output + file_name = file_name.rsplit('.f', 1)[0] + file_name = file_name.replace('_R1_001', '') + result[file_name] = Output_counter.create_row(temp_path, paired) + + # Clean up temporary directory + files_manager.tmp_dir(path, temp_path) + + # Output the result as CSV or display + do_output(result, new_file, single_file) + if write_csv is True: + new_file = (f'{path}Output/{os.path.basename(os.path.dirname(file_dir))}.csv') + do_output(result, new_file, single_file) + +def main(): + '''Main function to parse user arguments and initiate the workflow.''' + parser = argparse.ArgumentParser(prog='VX detector', description=( + 'This program tries to find which variable region of the 16S sequence was sequenced')) + parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) + parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, + help='User can specify a file format in which the output is written in the Output folder.') + parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', + help='If set the output will be written in a .csv file in the Output folder') + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, + help='Number of reads to sample from each FASTQ file') + + # Parse the user input arguments + args = parser.parse_args() + + # Start the workflow with provided arguments + workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) + +if __name__ == '__main__': + main() # Entry point of the script From b851734b67bf5f6b34831600ffd4519259663d5a Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 21 Nov 2024 14:28:07 +0100 Subject: [PATCH 09/37] Merged Multiprocessing Aspect with Subsampling for a complete Tool --- Output/.DS_Store | Bin 0 -> 6148 bytes vxdetector/VXdetector.py | 275 +++++++++++++++++-------------- vxdetector/VXdetector_sampler.py | 198 ---------------------- 3 files changed, 151 insertions(+), 322 deletions(-) create mode 100644
Output/.DS_Store delete mode 100644 vxdetector/VXdetector_sampler.py diff --git a/Output/.DS_Store b/Output/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 total_reads: + sample_size = total_reads + + if sampled_indices is None: + sampled_indices = sorted(random.sample(range(total_reads), sample_size)) + + with open_func(file_path, 'rt') as f: + read_idx = 0 + read_buffer = [] + for line in f: + read_buffer.append(line) + if len(read_buffer) == 4: # Once we have a full read + if read_idx in sampled_indices: + sampled_reads.extend(read_buffer) # Add the read if it matches the sampled index + read_buffer = [] # Clear the buffer for the next read + read_idx += 1 # Move to the next read + + # Stop if we've collected enough reads + if len(sampled_reads) >= sample_size * 4: + break + return sampled_reads, sampled_indices + def save_sampled_fastq(sampled_reads, output_path): - '''Save sampled reads to a temporary FASTQ file''' - with open(output_path, 'w') as f: + '''Save sampled reads to a temporary FASTQ file.''' + open_func = gzip.open if output_path.endswith('.gz') else open # Use gzip if output is .gz + + with open_func(output_path, 'wt') as f: # Write in text mode ('wt') f.writelines(sampled_reads) -def do_statistic(result): - average = result.mean(numeric_only=True).to_frame().T - region = (result['Sequenced variable region'].mode().values) - region = ' / '.join(str(r) for r in region) - region = region.replace('\'', '') - region = region.replace('[', '') - region = region.replace(']', '') +def do_statistic(result): + '''Performs statistical analysis on the DataFrame + + Calculates the mean and standard deviation for all numeric columns + in the DataFrame, and determines the most common sequenced variable + region. Adds these statistics to the DataFrame and returns the updated + DataFrame. 
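The gzip-or-plain open pattern that runs through the rewritten sampling code, shown in isolation; the path below is a placeholder:

```python
import gzip

path = 'reads.fastq.gz'  # placeholder; a plain .fastq works the same way
open_func = gzip.open if path.endswith('.gz') else open
with open_func(path, 'rt') as handle:  # 'rt' gives text mode for gzip too
    total_reads = sum(1 for _ in handle) // 4  # FASTQ: four lines per read
```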
+ ''' + average = result.mean(numeric_only=True).to_frame().T # Calculate mean for numeric columns + region = (result['Sequenced variable region'].mode().values) # Find the most common variable region + region = ' / '.join(str(r) for r in region) # Format the result for better readability + region = region.replace('\'', '').replace('[', '').replace(']', '') # Clean up formatting average['Sequenced variable region'] = region if 'Not properly paired' not in average.columns: - average['Not properly paired'] = 'not paired' - std_dev = result.std(numeric_only=True).to_frame().T - statistic = pd.concat([average, std_dev], axis=0) + average['Not properly paired'] = 'not paired' # Handle cases where reads are not paired + std_dev = result.std(numeric_only=True).to_frame().T # Calculate standard deviation for numeric columns + statistic = pd.concat([average, std_dev], axis=0) # Combine mean and standard deviation dataframes + # Select relevant columns and reorder them statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - statistic['row_descriptor'] = ['Average', 'Standard deviation'] - statistic = statistic.set_index('row_descriptor') - result = pd.concat([statistic, result], axis=0) + statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Add descriptors for the statistics + statistic = statistic.set_index('row_descriptor') # Set the row descriptors as the index + result = pd.concat([statistic, result], axis=0) # Combine the statistical results with the original data return result + def do_output(result, new_file, single_file): - - warnings.simplefilter(action='ignore', category=FutureWarning) - result = pd.DataFrame(result).T.sort_index() - - print("Available columns in result:", result.columns) - print(result.head()) # Zeigt die ersten Zeilen des DataFrames an + '''Writes the results into a CSV file - + Converts the dictionary of results into a DataFrame, calculates statistics + if working with multiple files, and writes the data into a CSV file. 
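do_output() silences a FutureWarning because pd.to_numeric(..., errors='ignore') is deprecated in recent pandas. An equivalent guarded conversion, as a sketch rather than part of the patch:

```python
import pandas as pd

def to_numeric_if_possible(column):
    # Convert a column to numeric, leaving non-numeric columns untouched.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column
```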
+ ''' + warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore warnings about future behavior + result = pd.DataFrame(result).T.sort_index() # Convert the dictionary to a DataFrame and sort by index for column in result: - result[column] = pd.to_numeric(result[column], errors='ignore') + result[column] = pd.to_numeric(result[column], errors='ignore') # Convert columns to numeric where possible if single_file is False: - result = do_statistic(result) + result = do_statistic(result) # If multiple files, calculate statistics else: + # Select relevant columns for single file processing result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - result.to_csv(new_file, index=True) + result.to_csv(new_file, index=True) # Write the results to a CSV file -def workflow(file_dir, new_file, write_csv, sample_size): - path = files_manager.get_lib() - temp_path = files_manager.tmp_dir(path, temp_path=None) - paired = False - single_file = False - result = dict() - buildbowtie2(path) - if not os.path.exists(file_dir): - raise ValueError(f"The provided path {file_dir} does not exist.") - - if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] and os.path.isdir(file_dir): - files_manager.tmp_dir(path, temp_path) - raise ValueError('There were no FASTQ files in this directory') - if os.path.isfile(file_dir): - single_file = True - file_name = os.path.basename(file_dir) - read2_file = os.path.join(os.path.dirname(file_dir), file_name.replace('_R1_', '_R2_')) - if '_R1_' in file_name and os.path.exists(read2_file): - paired = True - # Sample reads from the forward file (R1) and use the same indices for the reverse file (R2) - sampled_reads_R1, sampled_indices = sample_fastq(file_dir, sample_size) - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) - - # Save sampled reads for both R1 and R2 - temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') - temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') - save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) - save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) - - aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) - - print(f"Mapping result: {aligned_path}, Error: {Error}") - - if Error is True: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file does not look like a fastq file') - if paired is True and Output_counter.rawincount(f'{temp_path}paired.bed') == 0: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required mapping-quality') - overlap(path, temp_path, aligned_path) - if paired is False and Output_counter.rawincount(f'{temp_path}BED.bed') == 0: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required mapping-quality') - file_name = file_name.rsplit('.f', 1)[0] - file_name = file_name.replace('_R1_001', '') - result[file_name] = Output_counter.create_row(temp_path, paired) - - print(f"Output for {file_name}: {result[file_name]}") - - elif os.path.isdir(file_dir): - single_file = False - total_files = len(glob.glob(f'{file_dir}**/*.fastq*', recursive=True)) - processed_files = 0 - for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True): - processed_files += 1 - paired = False - if '_R2_' in fq_file: - continue +def process_file(fq_file, path, bowtie2_params, sample_size=None): + """ + Processes a single 
FASTQ file, including optional read sampling, alignment with Bowtie2, + and overlap analysis using Bedtools. + """ + paired = False + result = {} + try: + with tempfile.TemporaryDirectory() as temp_path: file_name = os.path.basename(fq_file) read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) if '_R1_' in file_name and os.path.exists(read2_file): - paired = True - - # Sample reads from R1 and use the same indices for R2 - sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) - - temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') - temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') - save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) - save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) - - aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) - if Error is True: - continue + paired = True # Paired-end sequencing detected + + if sample_size: # Perform sampling if required + sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) + temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') + save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) + fq_file = temp_fastq_R1 + + if paired: # Process the reverse reads for paired-end data + sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) + temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') + save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) + read2_file = temp_fastq_R2 + + # Perform alignment and overlap analysis + aligned_path, Error = mapbowtie2(fq_file, read2_file, path, temp_path, paired, bowtie2_params) + if Error: + return None overlap(path, temp_path, aligned_path) - - print(f"Overlap completed for {aligned_path}") - - file_name = file_name.rsplit('.f', 1)[0] - file_name = file_name.replace('_R1_001', '') + file_name = file_name.rsplit('.f', 1)[0].replace('_R1_001', '') result[file_name] = Output_counter.create_row(temp_path, paired) - - print(f"Output for {file_name}: {result[file_name]}") + except Exception as e: + print(f"Error processing file {fq_file}: {e}") + return result - print("Result content before do_output:", result) - - files_manager.tmp_dir(path, temp_path) + +def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): + """ + Manages the overall processing workflow, including directory traversal, BIOM file conversion, + and multiprocessing for FASTQ file processing. 
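The fan-out used by workflow() has this shape; the worker below is a stand-in for process_file(), and the job list is invented for illustration:

```python
from multiprocessing import Pool

def toy_worker(name, factor):
    # Stand-in for process_file(fq_file, path, bowtie2_params, sample_size).
    return {name: len(name) * factor}

if __name__ == '__main__':
    jobs = [('sampleA', 2), ('sampleB', 2)]
    with Pool() as pool:
        parts = pool.starmap(toy_worker, jobs)
    merged = {}
    for part in parts:
        if part:  # workers may return None on error
            merged.update(part)
```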
+ """ + path = files_manager.get_lib() + buildbowtie2(path) # Prepare Bowtie2 library + result = {} + single_file = False + + if file_dir.endswith('.biom'): # Handle BIOM file input + temp_fastq = os.path.join(tempfile.gettempdir(), 'temp_sequences.fastq') + biom_to_fastq(file_dir, temp_fastq) + result = process_file(temp_fastq, path, bowtie2_params, sample_size) + single_file = True + elif os.path.isfile(file_dir): # Process single FASTQ file + single_file = True + result = process_file(file_dir, path, bowtie2_params, sample_size) + elif os.path.isdir(file_dir): # Process directory of FASTQ files + fastq_files = glob.glob(f'{file_dir}/**/*.fastq*', recursive=True) + with Pool() as pool: + results = pool.starmap(process_file, [(fq_file, path, bowtie2_params, sample_size) + for fq_file in fastq_files if '_R2_' not in fq_file]) + for res in results: + if res: + result.update(res) do_output(result, new_file, single_file) - if write_csv is True: - new_file = (f'{path}Output/{os.path.basename(os.path.dirname(file_dir))}.csv') - do_output(result, new_file, single_file) - + def main(): + '''Main function to parse user arguments and initiate the workflow.''' parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This programm tries to find which variable region of the 16S sequence was sequencend')) + 'This program tries to find which variable region of the 16S sequence was sequenced')) parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, help='User can specify a file format in which the output is written in the Output folder.') @@ -176,8 +198,13 @@ def main(): help='If set the output will be written in a .csv file in the Output folder') parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, help='Number of reads to sample from each FASTQ file') + parser.add_argument('-b','--bowtie2_params', dest='bowtie2_params', default= None, + help='Additional parameters to pass to Bowtie2') + # Parse the user input arguments args = parser.parse_args() - workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) + + # Start the workflow with provided arguments + workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size, args.bowtie2_params) if __name__ == '__main__': - main() + main() # Entry point of the script \ No newline at end of file diff --git a/vxdetector/VXdetector_sampler.py b/vxdetector/VXdetector_sampler.py deleted file mode 100644 index 2af3313..0000000 --- a/vxdetector/VXdetector_sampler.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/python - -import argparse -import glob -import os -import sys -import warnings -import pandas as pd -import Output_counter as Output_counter -import files_manager as files_manager -from interact_bowtie2 import mapbowtie2, buildbowtie2 -from interact_bedtools import overlap -import random - -def sample_fastq(file_path, sample_size, sampled_indices=None): - '''Get random parts from the FASTQ file based on shared indices for paired-end reads.''' - sampled_reads = [] # List to hold sampled reads - with open(file_path, 'r') as f: - lines = f.readlines() - total_reads = len(lines) // 4 # FASTQ files store 4 lines per read - if sampled_indices is None: - # If no indices are provided, sample randomly from the total reads - sampled_indices = sorted(random.sample(range(total_reads), sample_size)) - for idx in sampled_indices: - # Get the read (4 lines per 
read) and append to the sampled_reads list - read = lines[idx*4:(idx+1)*4] - sampled_reads.extend(read) - return sampled_reads, sampled_indices - -def save_sampled_fastq(sampled_reads, output_path): - '''Save sampled reads to a temporary FASTQ file.''' - with open(output_path, 'w') as f: - f.writelines(sampled_reads) - -def do_statistic(result): - '''Compute statistics (mean and standard deviation) from alignment results.''' - average = result.mean(numeric_only=True).to_frame().T # Compute the mean of numeric columns - region = (result['Sequenced variable region'].mode().values) # Get most frequent region - region = ' / '.join(str(r) for r in region) # Join the modes into a single string - region = region.replace('\'', '').replace('[', '').replace(']', '') # Clean up string format - average['Sequenced variable region'] = region # Add region info to the average dataframe - if 'Not properly paired' not in average.columns: - average['Not properly paired'] = 'not paired' - std_dev = result.std(numeric_only=True).to_frame().T # Compute standard deviation of numeric columns - statistic = pd.concat([average, std_dev], axis=0) # Combine mean and standard deviation - # Select specific columns for the final output - statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Add row descriptors - statistic = statistic.set_index('row_descriptor') # Set descriptor as index - result = pd.concat([statistic, result], axis=0) # Append stats to original results - return result - -def do_output(result, new_file, single_file): - '''Process and output results to a CSV file.''' - warnings.simplefilter(action='ignore', category=FutureWarning) - result = pd.DataFrame(result).T.sort_index() # Convert result to DataFrame and sort - - # Convert columns to numeric where applicable - for column in result: - result[column] = pd.to_numeric(result[column], errors='ignore') - - if single_file is False: - result = do_statistic(result) # Add statistics if processing multiple files - else: - result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - - # Write the result DataFrame to a CSV file - result.to_csv(new_file, index=True) - -def workflow(file_dir, new_file, write_csv, sample_size): - '''Main workflow for processing FASTQ files and performing sequence alignment.''' - path = files_manager.get_lib() # Get the main working directory - temp_path = files_manager.tmp_dir(path, temp_path=None) # Create a temporary directory - paired = False # To check if the input is paired-end - single_file = False # To check if processing a single file - result = dict() # Dictionary to store results from different files - buildbowtie2(path) # Build the Bowtie2 index - - # Check if the input path exists - if not os.path.exists(file_dir): - raise ValueError(f"The provided path {file_dir} does not exist.") - - # Check if there are FASTQ files in the directory - if glob.glob(f'{file_dir}**/*.fastq*', recursive=True) == [] and os.path.isdir(file_dir): - files_manager.tmp_dir(path, temp_path) - raise ValueError('There were no FASTQ files in this directory') - - # Process single file - if os.path.isfile(file_dir): - single_file = True - file_name = os.path.basename(file_dir) - 
read2_file = os.path.join(os.path.dirname(file_dir), file_name.replace('_R1_', '_R2_')) # Get the paired file if exists - if '_R1_' in file_name and os.path.exists(read2_file): - paired = True # Mark as paired if both R1 and R2 files exist - - # Sample reads from R1 and R2 with same indices - sampled_reads_R1, sampled_indices = sample_fastq(file_dir, sample_size) - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) - - # Save sampled reads to temporary files - temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') - temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') - save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) - save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) - - # Run Bowtie2 alignment - aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) - - if Error is True: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file does not look like a fastq file') - if paired is True and Output_counter.rawincount(f'{temp_path}paired.bed') == 0: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required mapping-quality') - - # Perform overlap analysis using Bedtools - overlap(path, temp_path, aligned_path) - if paired is False and Output_counter.rawincount(f'{temp_path}BED.bed') == 0: - files_manager.tmp_dir(path, temp_path) - raise ValueError('This file has no Reads of the required mapping-quality') - - # Generate results from output - file_name = file_name.rsplit('.f', 1)[0] - file_name = file_name.replace('_R1_001', '') - result[file_name] = Output_counter.create_row(temp_path, paired) - - # Process directory of files - elif os.path.isdir(file_dir): - single_file = False - total_files = len(glob.glob(f'{file_dir}**/*.fastq*', recursive=True)) # Count number of FASTQ files - processed_files = 0 - for fq_file in glob.glob(f'{file_dir}**/*.fastq*', recursive=True): - processed_files += 1 - paired = False - if '_R2_' in fq_file: - continue # Skip R2 files to avoid processing duplicates - - file_name = os.path.basename(fq_file) - read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) - if '_R1_' in file_name and os.path.exists(read2_file): - paired = True # Mark as paired if both R1 and R2 files exist - - # Sample reads from R1 and R2 with same indices - sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) - - # Save sampled reads to temporary files - temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') - temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') - save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) - save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) - - # Run Bowtie2 alignment - aligned_path, Error = mapbowtie2(temp_fastq_R1, temp_fastq_R2, path, temp_path, paired) - if Error is True: - continue # Skip if there's an error in processing the file - - # Perform overlap analysis using Bedtools - overlap(path, temp_path, aligned_path) - - # Generate results from output - file_name = file_name.rsplit('.f', 1)[0] - file_name = file_name.replace('_R1_001', '') - result[file_name] = Output_counter.create_row(temp_path, paired) - - # Clean up temporary directory - files_manager.tmp_dir(path, temp_path) - - # Output the result as CSV or display - do_output(result, new_file, single_file) - if write_csv is True: - new_file = (f'{path}Output/{os.path.basename(os.path.dirname(file_dir))}.csv') - do_output(result, new_file, 
single_file) - -def main(): - '''Main function to parse user arguments and initiate the workflow.''' - parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This program tries to find which variable region of the 16S sequence was sequenced')) - parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) - parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, - help='User can specify a file format in which the output is written in the Output folder.') - parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', - help='If set the output will be written in a .csv file in the Output folder') - parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, - help='Number of reads to sample from each FASTQ file') - - # Parse the user input arguments - args = parser.parse_args() - - # Start the workflow with provided arguments - workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size) - -if __name__ == '__main__': - main() # Entry point of the script From 765987bf8ef627825ef6515804daf913aa550900 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Tue, 26 Nov 2024 15:31:38 +0100 Subject: [PATCH 10/37] Added docstrings to the missing methods. Added the sample_fastq method from the memory_efficient_sampling branch. Sampled indices are no longer used in sample_fastq, yet still needed for the workflow to function. --- vxdetector/VXdetector.py | 271 ++++++++++++++++++++++++++++++++------- 1 file changed, 228 insertions(+), 43 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 755d138..6c69617 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -18,7 +18,33 @@ import random def biom_to_fastq(biom_file, fastq_output): - '''Converts a BIOM file to FASTQ format by extracting sequences and ensuring they are compatible with FASTQ standards.''' + """ + Converts a BIOM file into a FASTQ file. + + This function reads a BIOM file, which is a binary JSON-like format often used in metagenomics to store + sequence counts and metadata. It extracts sequence identifiers and sequences, and formats them into a + FASTQ format. Each sequence is given a default quality score to make it compatible with downstream + bioinformatics tools that require FASTQ files. + + Parameters + ---------- + biom_file : str + Path to the input BIOM file containing sequence data and associated metadata. + fastq_output : str + Path to the output FASTQ file where the converted sequences will be saved. Supports gzip compression + if the filename ends with '.gz'. + + Returns + ------- + None + Writes the converted sequences to the specified FASTQ file. + + Notes + ----- + - Ensure that the input BIOM file is properly formatted and includes sequence data. + - The FASTQ format generated assumes a fixed quality score, which may need adjustment based on downstream + applications.
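For reference, one record emitted by the converter has this shape; the fixed quality character ('I', Phred 40) is an assumption for illustration, since the docstring only states that a default score is used:

```python
# Hypothetical FASTQ record built from one BIOM observation.
seq_id, seq = 'observation_1', 'ACGTACGT'
record = f'@{seq_id}\n{seq}\n+\n{"I" * len(seq)}\n'
```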
+ """ with open(fastq_output, "w") as f: table = biom.load_table(biom_file) # Load the BIOM table sequences = table.ids(axis='observation') # Extract sequences @@ -31,42 +57,111 @@ def biom_to_fastq(biom_file, fastq_output): print(f"BIOM file successfully converted to FASTQ format at: {fastq_output}") -def sample_fastq(file_path, sample_size, sampled_indices=None): - '''Get random parts from the FASTQ file based on shared indices for paired-end reads.''' - sampled_reads = [] # List to hold sampled reads - open_func = gzip.open if file_path.endswith('.gz') else open +def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None): + """Read a (gzipped) fastQ file and randomly (without replacement) select + a given number of reads. + + Assumes that each reads comes in exactly four lines. To save memory and + have reasonable speed, we first pass through the file "just" to count + the total amount of lines (div by 4 gives the number of reads). + Then we randomly draw (without replacement) sample_size many numbers + between 0 and line-numbers (div by 4 to only target header lines) and sort + this list. Next, we read the file again and keep track of the line + number (ln). Whenever the line number matches the current random line + number, the file line and the three following are pushed to the + sampled_readlines array. + + Parameters + ---------- + file_path : str + File path of the fastQ (gzipped) file containing the reads. + sample_size : int + Number of random reads taken from the fastQ file. + sampled_indices : list of int, optional + Consistent sampling across paired end files + seed : int + Random generator seed. - # Count total reads in the file by iterating line by line - with open_func(file_path, 'rt') as f: - total_reads = sum(1 for _ in f) // 4 + Returns + ------- + Tuple of list of random read lines and list of randomly selected line + numbers. + """ - # Adjust sample_size if it's greater than total_reads - if sample_size > total_reads: - sample_size = total_reads + def _count_generator(reader): + b = reader(1024 * 1024) + while b: + yield b + b = reader(1024 * 1024) - if sampled_indices is None: - sampled_indices = sorted(random.sample(range(total_reads), sample_size)) + def _get_filehandle(file_path): + # open file either plain or as gzip compressed file + FH = None + if file_path.endswith('.gz'): + FH = gzip.open(file_path, 'rb') + else: + FH = open(file_path, 'rb') + return FH - with open_func(file_path, 'rt') as f: - read_idx = 0 - read_buffer = [] - for line in f: - read_buffer.append(line) - if len(read_buffer) == 4: # Once we have a full read - if read_idx in sampled_indices: - sampled_reads.extend(read_buffer) # Add the read if it matches the sampled index - read_buffer = [] # Clear the buffer for the next read - read_idx += 1 # Move to the next read + # first pass: count number of lines in the file + FH = _get_filehandle(file_path) + c_generator = _count_generator(FH.read) + number_lines = sum(buffer.count(b'\n') for buffer in c_generator) + assert number_lines % 4 == 0, "file %s does not contain a multiple of 4 lines! 
%i" % (file_path, number_lines) + FH.close() - # Stop if we've collected enough reads - if len(sampled_reads) >= sample_size * 4: - break + # second pass: iterate through file with line counts + # add next 4 lines to sampled_reads, iff linecount is in random selection + sampled_readlines = [] # List to hold sampled reads + if seed is not None: + random.seed(seed) + sel_reads = sorted(list(map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), sample_size)))) + FH = _get_filehandle(file_path) + ln = 0 # line number iterator + j = 0 # iterator through sorted, random header line numbers + while ln < number_lines: + line = FH.readline() + if ln == sel_reads[j]: + sampled_readlines.append(str(line, encoding='utf-8')) + for i in range(3): + line = FH.readline() + sampled_readlines.append(str(line, encoding='utf-8')) + ln += 1 + if j + 1 < len(sel_reads): + j += 1 + else: + break + ln += 1 + FH.close() - return sampled_reads, sampled_indices + return sampled_readlines, sel_reads def save_sampled_fastq(sampled_reads, output_path): - '''Save sampled reads to a temporary FASTQ file.''' + """ + Writes sampled sequencing reads into a FASTQ file. + + The function accepts a list of sampled reads in FASTQ format and saves them to the specified output + file. If the output file ends with '.gz', the file is automatically compressed using gzip. This function + ensures that the sampled reads are stored efficiently for use in subsequent steps like alignment or + analysis. + + Parameters + ---------- + sampled_reads : list of str + List of reads in FASTQ format, where each read consists of four lines: + - Line 1: Sequence identifier (e.g., @SEQ_ID). + - Line 2: Raw sequence. + - Line 3: Optional '+' separator. + - Line 4: Quality scores. + output_path : str + Path to the output file. If the file ends with '.gz', it will be written in a compressed format. + + Returns + ------- + None + Saves the sampled reads to the output file. + """ open_func = gzip.open if output_path.endswith('.gz') else open # Use gzip if output is .gz with open_func(output_path, 'wt') as f: # Write in text mode ('wt') @@ -74,13 +169,24 @@ def save_sampled_fastq(sampled_reads, output_path): def do_statistic(result): - '''Performs statistical analysis on the DataFrame + """ + Performs statistical analysis on sequencing data. + + This function analyzes the results from processing FASTQ files, calculating metrics such as the mean and + standard deviation for numeric columns in the data. It also identifies the most common variable region + across sequences, appending these statistics as new columns in the DataFrame. - Calculates the mean and standard deviation for all numeric columns - in the DataFrame, and determines the most common sequenced variable - region. Adds these statistics to the DataFrame and returns the updated - DataFrame. - ''' + Parameters + ---------- + result : pandas.DataFrame + DataFrame containing processed sequencing data. Expected columns include numeric metrics and + categorical data such as variable regions. + + Returns + ------- + pandas.DataFrame + Updated DataFrame with additional columns for calculated statistics. 
+ """ average = result.mean(numeric_only=True).to_frame().T # Calculate mean for numeric columns region = (result['Sequenced variable region'].mode().values) # Find the most common variable region region = ' / '.join(str(r) for r in region) # Format the result for better readability @@ -101,11 +207,28 @@ def do_statistic(result): def do_output(result, new_file, single_file): - '''Writes the results into a CSV file + """ + Writes processing results to a CSV file. + + Converts sequencing results stored in a dictionary into a DataFrame, optionally calculates statistics, + and writes the data to a CSV file for downstream analysis or record-keeping. If the results are from a + single input file, no additional statistics are calculated. - Converts the dictionary of results into a DataFrame, calculates statistics - if working with multiple files, and writes the data into a CSV file. - ''' + Parameters + ---------- + result : dict + Dictionary containing processing results, keyed by file name. + new_file : str + Path to the output CSV file where results will be saved. + single_file : bool + If True, skips statistical calculations as the results are from a single input file. + + Returns + ------- + None + Saves the results to the specified file. + """ + warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore warnings about future behavior result = pd.DataFrame(result).T.sort_index() # Convert the dictionary to a DataFrame and sort by index for column in result: @@ -122,9 +245,30 @@ def do_output(result, new_file, single_file): def process_file(fq_file, path, bowtie2_params, sample_size=None): """ - Processes a single FASTQ file, including optional read sampling, alignment with Bowtie2, - and overlap analysis using Bedtools. + Processes an individual FASTQ file through sampling, alignment, and overlap analysis. + + This function handles all steps for processing a single FASTQ file. It includes optional random sampling + of reads to reduce data size, alignment to a reference genome using Bowtie2, and analysis of overlap + regions using Bedtools. + + Parameters + ---------- + fq_file : str + Path to the input FASTQ file to process. + path : str + Path to the Bowtie2 library and index files. + bowtie2_params : str + Additional parameters for Bowtie2 alignment (e.g., "--fast --threads 4"). + sample_size : int, optional + Number of reads to randomly sample. If None, the entire file is processed. + + Returns + ------- + dict or None + A dictionary with processing results, including file name and metrics. Returns None if an error + occurs during processing. """ + paired = False result = {} try: @@ -160,9 +304,30 @@ def process_file(fq_file, path, bowtie2_params, sample_size=None): def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): """ - Manages the overall processing workflow, including directory traversal, BIOM file conversion, - and multiprocessing for FASTQ file processing. + Executes the full workflow for processing sequencing data. + + The workflow includes directory traversal, conversion of BIOM files to FASTQ format (if needed), and + parallelized processing of FASTQ files. Results are aggregated and optionally written to a CSV file. + + Parameters + ---------- + file_dir : str + Path to the directory or file containing input sequencing data. + new_file : str + Path to the CSV file where results will be saved. + write_csv : bool + If True, writes the results to a CSV file. + sample_size : int + Number of reads to sample from each FASTQ file. 
If None, all reads are processed. + bowtie2_params : str + Additional parameters for Bowtie2 alignment. + + Returns + ------- + None + Executes the workflow and outputs results as specified. """ + path = files_manager.get_lib() buildbowtie2(path) # Prepare Bowtie2 library result = {} @@ -188,7 +353,27 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): def main(): - '''Main function to parse user arguments and initiate the workflow.''' + """ + Parses user input and initiates the workflow. + + This function uses argparse to handle command-line arguments, passing them to the workflow function. + It provides flexibility for customizing the input directory, Bowtie2 parameters, and output options. + + Parameters + ---------- + None + + Returns + ------- + None + Executes the script based on user-provided inputs. + + Notes + ----- + - Command-line usage should follow the format specified in the argparse help message. + - Ensure all required dependencies (Bowtie2, Bedtools) are installed and accessible. + """ + parser = argparse.ArgumentParser(prog='VX detector', description=( 'This program tries to find which variable region of the 16S sequence was sequenced')) parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) From 7f0908cc7b49bc79c5af1bda07786608b4c88133 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 28 Nov 2024 13:09:35 +0100 Subject: [PATCH 11/37] Formatting --- vxdetector/VXdetector.py | 113 ++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 42 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 6c69617..5ce83ef 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -13,10 +13,10 @@ import tempfile from multiprocessing import Pool import biom -import shutil import gzip import random + def biom_to_fastq(biom_file, fastq_output): """ Converts a BIOM file into a FASTQ file. @@ -38,7 +38,7 @@ def biom_to_fastq(biom_file, fastq_output): ------- None Writes the converted sequences to the specified FASTQ file. - + Notes ----- - Ensure that the input BIOM file is properly formatted and includes sequence data. 
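
Reviewer note: the biom_to_fastq hunks in this patch only work when the BIOM observation IDs are themselves the sequences (the Deblur/DADA2 convention), since the FASTQ sequence is rebuilt from seq_id via filter(str.isalpha, ...) rather than read from table metadata, and every base gets the fixed quality 'I' (Phred 40 in Sanger encoding). Also, the hunk below wraps an expression inside the f-string braces across lines, which only parses on Python 3.12+ (PEP 701), and the follow-up in [PATCH 12/37] instead breaks the string with a backslash, which embeds the continuation line's indentation as literal spaces in the written output. A standalone sketch of the conversion under the same ID-equals-sequence assumption, using adjacent string literals to stay within line limits without changing the emitted bytes (the function name here is illustrative, not part of the patch):

    import biom

    def biom_ids_to_fastq(biom_file, fastq_output):
        """Sketch: assumes observation IDs are the sequences themselves."""
        table = biom.load_table(biom_file)  # parse the BIOM table
        with open(fastq_output, "w") as out:
            for seq_id in table.ids(axis='observation'):
                seq = ''.join(filter(str.isalpha, seq_id))  # letters only
                # truncated header plus constant quality 'I' for every base
                out.write(f"@{seq_id[:50]}\n{seq}\n+\n"
                          f"{'I' * len(seq)}\n")
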
@@ -50,11 +50,15 @@ def biom_to_fastq(biom_file, fastq_output): sequences = table.ids(axis='observation') # Extract sequences for seq_id in sequences: # Truncate the sequence ID if it’s too long for Bowtie2's requirements - truncated_id = seq_id[:50] # Cut the sequence ID to 50 characters (adjust if needed) + # Cut the sequence ID to 50 characters (adjust if needed) + truncated_id = seq_id[:50] # Check and sanitize the sequence to avoid unwanted characters - sanitized_seq = ''.join(filter(str.isalpha, seq_id)) # Ensure only letters are in the sequence - f.write(f"@{truncated_id}\n{sanitized_seq}\n+\n{'I' * len(sanitized_seq)}\n") # Write in FASTQ format - print(f"BIOM file successfully converted to FASTQ format at: {fastq_output}") + # Ensure only letters are in the sequence + sanitized_seq = ''.join(filter(str.isalpha, seq_id)) + f.write(f"@{truncated_id}\n{sanitized_seq}\n+\n{'I' * + len(sanitized_seq)}\n") # Write in FASTQ format + print(f"BIOM file successfully converted to FASTQ format at: { + fastq_output}") def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None): @@ -107,7 +111,8 @@ def _get_filehandle(file_path): FH = _get_filehandle(file_path) c_generator = _count_generator(FH.read) number_lines = sum(buffer.count(b'\n') for buffer in c_generator) - assert number_lines % 4 == 0, "file %s does not contain a multiple of 4 lines! %i" % (file_path, number_lines) + assert number_lines % 4 == 0, "file %s does not contain a multiple of 4 lines! %i" % ( + file_path, number_lines) FH.close() # second pass: iterate through file with line counts @@ -115,7 +120,8 @@ def _get_filehandle(file_path): sampled_readlines = [] # List to hold sampled reads if seed is not None: random.seed(seed) - sel_reads = sorted(list(map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), sample_size)))) + sel_reads = sorted(list( + map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), sample_size)))) FH = _get_filehandle(file_path) ln = 0 # line number iterator j = 0 # iterator through sorted, random header line numbers @@ -128,7 +134,7 @@ def _get_filehandle(file_path): sampled_readlines.append(str(line, encoding='utf-8')) ln += 1 if j + 1 < len(sel_reads): - j += 1 + j += 1 else: break ln += 1 @@ -162,7 +168,8 @@ def save_sampled_fastq(sampled_reads, output_path): None Saves the sampled reads to the output file. """ - open_func = gzip.open if output_path.endswith('.gz') else open # Use gzip if output is .gz + open_func = gzip.open if output_path.endswith( + '.gz') else open # Use gzip if output is .gz with open_func(output_path, 'wt') as f: # Write in text mode ('wt') f.writelines(sampled_reads) @@ -187,22 +194,32 @@ def do_statistic(result): pandas.DataFrame Updated DataFrame with additional columns for calculated statistics. 
""" - average = result.mean(numeric_only=True).to_frame().T # Calculate mean for numeric columns - region = (result['Sequenced variable region'].mode().values) # Find the most common variable region - region = ' / '.join(str(r) for r in region) # Format the result for better readability - region = region.replace('\'', '').replace('[', '').replace(']', '') # Clean up formatting + average = result.mean(numeric_only=True).to_frame( + ).T # Calculate mean for numeric columns + # Find the most common variable region + region = (result['Sequenced variable region'].mode().values) + # Format the result for better readability + region = ' / '.join(str(r) for r in region) + region = region.replace('\'', '').replace( + '[', '').replace(']', '') # Clean up formatting average['Sequenced variable region'] = region if 'Not properly paired' not in average.columns: - average['Not properly paired'] = 'not paired' # Handle cases where reads are not paired - std_dev = result.std(numeric_only=True).to_frame().T # Calculate standard deviation for numeric columns - statistic = pd.concat([average, std_dev], axis=0) # Combine mean and standard deviation dataframes + # Handle cases where reads are not paired + average['Not properly paired'] = 'not paired' + # Calculate standard deviation for numeric columns + std_dev = result.std(numeric_only=True).to_frame().T + # Combine mean and standard deviation dataframes + statistic = pd.concat([average, std_dev], axis=0) # Select relevant columns and reorder them statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'Not aligned to a variable region']] - statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Add descriptors for the statistics - statistic = statistic.set_index('row_descriptor') # Set the row descriptors as the index - result = pd.concat([statistic, result], axis=0) # Combine the statistical results with the original data + # Add descriptors for the statistics + statistic['row_descriptor'] = ['Average', 'Standard deviation'] + # Set the row descriptors as the index + statistic = statistic.set_index('row_descriptor') + # Combine the statistical results with the original data + result = pd.concat([statistic, result], axis=0) return result @@ -228,13 +245,17 @@ def do_output(result, new_file, single_file): None Saves the results to the specified file. 
""" - - warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore warnings about future behavior - result = pd.DataFrame(result).T.sort_index() # Convert the dictionary to a DataFrame and sort by index + + # Ignore warnings about future behavior + warnings.simplefilter(action='ignore', category=FutureWarning) + # Convert the dictionary to a DataFrame and sort by index + result = pd.DataFrame(result).T.sort_index() for column in result: - result[column] = pd.to_numeric(result[column], errors='ignore') # Convert columns to numeric where possible + # Convert columns to numeric where possible + result[column] = pd.to_numeric(result[column], errors='ignore') if single_file is False: - result = do_statistic(result) # If multiple files, calculate statistics + # If multiple files, calculate statistics + result = do_statistic(result) else: # Select relevant columns for single file processing result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', @@ -268,30 +289,34 @@ def process_file(fq_file, path, bowtie2_params, sample_size=None): A dictionary with processing results, including file name and metrics. Returns None if an error occurs during processing. """ - + paired = False result = {} try: with tempfile.TemporaryDirectory() as temp_path: file_name = os.path.basename(fq_file) - read2_file = os.path.join(os.path.dirname(fq_file), file_name.replace('_R1_', '_R2_')) + read2_file = os.path.join(os.path.dirname( + fq_file), file_name.replace('_R1_', '_R2_')) if '_R1_' in file_name and os.path.exists(read2_file): paired = True # Paired-end sequencing detected if sample_size: # Perform sampling if required - sampled_reads_R1, sampled_indices = sample_fastq(fq_file, sample_size) + sampled_reads_R1, sampled_indices = sample_fastq( + fq_file, sample_size) temp_fastq_R1 = os.path.join(temp_path, 'sampled_R1.fastq') save_sampled_fastq(sampled_reads_R1, temp_fastq_R1) fq_file = temp_fastq_R1 if paired: # Process the reverse reads for paired-end data - sampled_reads_R2, _ = sample_fastq(read2_file, sample_size, sampled_indices) + sampled_reads_R2, _ = sample_fastq( + read2_file, sample_size, sampled_indices) temp_fastq_R2 = os.path.join(temp_path, 'sampled_R2.fastq') save_sampled_fastq(sampled_reads_R2, temp_fastq_R2) read2_file = temp_fastq_R2 # Perform alignment and overlap analysis - aligned_path, Error = mapbowtie2(fq_file, read2_file, path, temp_path, paired, bowtie2_params) + aligned_path, Error = mapbowtie2( + fq_file, read2_file, path, temp_path, paired, bowtie2_params) if Error: return None overlap(path, temp_path, aligned_path) @@ -327,14 +352,15 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): None Executes the workflow and outputs results as specified. """ - + path = files_manager.get_lib() buildbowtie2(path) # Prepare Bowtie2 library result = {} single_file = False if file_dir.endswith('.biom'): # Handle BIOM file input - temp_fastq = os.path.join(tempfile.gettempdir(), 'temp_sequences.fastq') + temp_fastq = os.path.join( + tempfile.gettempdir(), 'temp_sequences.fastq') biom_to_fastq(file_dir, temp_fastq) result = process_file(temp_fastq, path, bowtie2_params, sample_size) single_file = True @@ -367,29 +393,32 @@ def main(): ------- None Executes the script based on user-provided inputs. - + Notes ----- - Command-line usage should follow the format specified in the argparse help message. - Ensure all required dependencies (Bowtie2, Bedtools) are installed and accessible. 
""" - + parser = argparse.ArgumentParser(prog='VX detector', description=( 'This program tries to find which variable region of the 16S sequence was sequenced')) - parser.add_argument('dir_path', help=('Directory path of the directory containing multiple fastq or fasta files.')) - parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, + parser.add_argument('dir_path', help=( + 'Directory path of the directory containing multiple fastq or fasta files.')) + parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, help='User can specify a file format in which the output is written in the Output folder.') - parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', + parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', help='If set the output will be written in a .csv file in the Output folder') - parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, help='Number of reads to sample from each FASTQ file') - parser.add_argument('-b','--bowtie2_params', dest='bowtie2_params', default= None, + parser.add_argument('-b', '--bowtie2_params', dest='bowtie2_params', default=None, help='Additional parameters to pass to Bowtie2') # Parse the user input arguments args = parser.parse_args() - + # Start the workflow with provided arguments - workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size, args.bowtie2_params) + workflow(args.dir_path, args.output_file, args.write_csv, + args.sample_size, args.bowtie2_params) + if __name__ == '__main__': - main() # Entry point of the script \ No newline at end of file + main() # Entry point of the script From dd79888c89ef1ccb87cb1d302c6a6c683ac2d64d Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 28 Nov 2024 13:40:36 +0100 Subject: [PATCH 12/37] more formatting --- vxdetector/VXdetector.py | 148 ++++++++++++++++++++++++--------------- 1 file changed, 90 insertions(+), 58 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 5ce83ef..8866a32 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -21,17 +21,20 @@ def biom_to_fastq(biom_file, fastq_output): """ Converts a BIOM file into a FASTQ file. - This function reads a BIOM file, which is a binary JSON-like format often used in metagenomics to store - sequence counts and metadata. It extracts sequence identifiers and sequences, and formats them into a - FASTQ format. Each sequence is given a default quality score to make it compatible with downstream - bioinformatics tools that require FASTQ files. + This function reads a BIOM file, which is a binary JSON-like format often + used in metagenomics to store sequence counts and metadata. It extracts + sequence identifiers and sequences, and formats them into a FASTQ format. + Each sequence is given a default quality score to make it compatible with + downstream bioinformatics tools that require FASTQ files. Parameters ---------- biom_file : str - Path to the input BIOM file containing sequence data and associated metadata. + Path to the input BIOM file containing sequence data + and associated metadata. fastq_output : str - Path to the output FASTQ file where the converted sequences will be saved. Supports gzip compression + Path to the output FASTQ file where the converted sequences will + be saved. Supports gzip compression if the filename ends with '.gz'. 
Returns @@ -41,24 +44,26 @@ def biom_to_fastq(biom_file, fastq_output): Notes ----- - - Ensure that the input BIOM file is properly formatted and includes sequence data. - - The FASTQ format generated assumes a fixed quality score, which may need adjustment based on downstream - applications. + - Ensure that the input BIOM file is properly formatted and + includes sequence data. + - The FASTQ format generated assumes a fixed quality score, + which may need adjustment based on downstream applications. """ with open(fastq_output, "w") as f: table = biom.load_table(biom_file) # Load the BIOM table sequences = table.ids(axis='observation') # Extract sequences for seq_id in sequences: - # Truncate the sequence ID if it’s too long for Bowtie2's requirements + # Truncate the sequence ID if it’s too long for Bowtie2's + # requirements. # Cut the sequence ID to 50 characters (adjust if needed) truncated_id = seq_id[:50] # Check and sanitize the sequence to avoid unwanted characters # Ensure only letters are in the sequence sanitized_seq = ''.join(filter(str.isalpha, seq_id)) - f.write(f"@{truncated_id}\n{sanitized_seq}\n+\n{'I' * - len(sanitized_seq)}\n") # Write in FASTQ format - print(f"BIOM file successfully converted to FASTQ format at: { - fastq_output}") + f.write(f"@{truncated_id}\n{sanitized_seq}\n+ \ + \n{'I' * len(sanitized_seq)}\n") # Write in FASTQ format + print(f"BIOM file successfully converted to FASTQ format at: \ + {fastq_output}") def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None): @@ -69,8 +74,8 @@ def sample_fastq(file_path, sample_size, sampled_indices=None, seed=None): have reasonable speed, we first pass through the file "just" to count the total amount of lines (div by 4 gives the number of reads). Then we randomly draw (without replacement) sample_size many numbers - between 0 and line-numbers (div by 4 to only target header lines) and sort - this list. Next, we read the file again and keep track of the line + between 0 and line-numbers (div by 4 to only target header lines) and + sort this list. Next, we read the file again and keep track of the line number (ln). Whenever the line number matches the current random line number, the file line and the three following are pushed to the sampled_readlines array. @@ -111,7 +116,8 @@ def _get_filehandle(file_path): FH = _get_filehandle(file_path) c_generator = _count_generator(FH.read) number_lines = sum(buffer.count(b'\n') for buffer in c_generator) - assert number_lines % 4 == 0, "file %s does not contain a multiple of 4 lines! %i" % ( + assert number_lines % 4 == 0, "file %s does not contain \ + a multiple of 4 lines! %i" % ( file_path, number_lines) FH.close() @@ -121,7 +127,8 @@ def _get_filehandle(file_path): if seed is not None: random.seed(seed) sel_reads = sorted(list( - map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), sample_size)))) + map(lambda x: x*4, random.sample(range(int(number_lines / 4) + 1), + sample_size)))) FH = _get_filehandle(file_path) ln = 0 # line number iterator j = 0 # iterator through sorted, random header line numbers @@ -147,10 +154,11 @@ def save_sampled_fastq(sampled_reads, output_path): """ Writes sampled sequencing reads into a FASTQ file. - The function accepts a list of sampled reads in FASTQ format and saves them to the specified output - file. If the output file ends with '.gz', the file is automatically compressed using gzip. 
This function - ensures that the sampled reads are stored efficiently for use in subsequent steps like alignment or - analysis. + The function accepts a list of sampled reads in FASTQ format and + saves them to the specified output file. If the output file ends + with '.gz', the file is automatically compressed using gzip. + This function ensures that the sampled reads are stored efficiently + for use in subsequent steps like alignment or analysis. Parameters ---------- @@ -161,7 +169,8 @@ def save_sampled_fastq(sampled_reads, output_path): - Line 3: Optional '+' separator. - Line 4: Quality scores. output_path : str - Path to the output file. If the file ends with '.gz', it will be written in a compressed format. + Path to the output file. If the file ends with '.gz', + it will be written in a compressed format. Returns ------- @@ -179,15 +188,17 @@ def do_statistic(result): """ Performs statistical analysis on sequencing data. - This function analyzes the results from processing FASTQ files, calculating metrics such as the mean and - standard deviation for numeric columns in the data. It also identifies the most common variable region - across sequences, appending these statistics as new columns in the DataFrame. + This function analyzes the results from processing FASTQ files, + calculating metrics such as the mean and standard deviation for + numeric columns in the data. It also identifies the most common + variable region across sequences, appending these statistics as + new columns in the DataFrame. Parameters ---------- result : pandas.DataFrame - DataFrame containing processed sequencing data. Expected columns include numeric metrics and - categorical data such as variable regions. + DataFrame containing processed sequencing data. Expected columns + include numeric metrics and categorical data such as variable regions. Returns ------- @@ -211,9 +222,10 @@ def do_statistic(result): # Combine mean and standard deviation dataframes statistic = pd.concat([average, std_dev], axis=0) # Select relevant columns and reorder them - statistic = statistic[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region']] + statistic = statistic[ + ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] # Add descriptors for the statistics statistic['row_descriptor'] = ['Average', 'Standard deviation'] # Set the row descriptors as the index @@ -227,8 +239,9 @@ def do_output(result, new_file, single_file): """ Writes processing results to a CSV file. - Converts sequencing results stored in a dictionary into a DataFrame, optionally calculates statistics, - and writes the data to a CSV file for downstream analysis or record-keeping. If the results are from a + Converts sequencing results stored in a dictionary into a DataFrame, + optionally calculates statistics, and writes the data to a CSV file + for downstream analysis or record-keeping. If the results are from a single input file, no additional statistics are calculated. Parameters @@ -238,7 +251,8 @@ def do_output(result, new_file, single_file): new_file : str Path to the output CSV file where results will be saved. single_file : bool - If True, skips statistical calculations as the results are from a single input file. 
+ If True, skips statistical calculations as the results + are from a single input file. Returns ------- @@ -258,19 +272,22 @@ def do_output(result, new_file, single_file): result = do_statistic(result) else: # Select relevant columns for single file processing - result = result[['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', - 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', - 'V7', 'V8', 'V9', 'Not aligned to a variable region']] + result = result[ + ['Number of Reads', 'Unaligned Reads [%]', 'Not properly paired', + 'Sequenced variable region', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', + 'V7', 'V8', 'V9', 'Not aligned to a variable region']] result.to_csv(new_file, index=True) # Write the results to a CSV file def process_file(fq_file, path, bowtie2_params, sample_size=None): """ - Processes an individual FASTQ file through sampling, alignment, and overlap analysis. + Processes an individual FASTQ file through sampling, alignment, + and overlap analysis. - This function handles all steps for processing a single FASTQ file. It includes optional random sampling - of reads to reduce data size, alignment to a reference genome using Bowtie2, and analysis of overlap - regions using Bedtools. + This function handles all steps for processing a single FASTQ file. + It includes optional random sampling of reads to reduce data size, + alignment to a reference genome using Bowtie2, and analysis of + overlap regions using Bedtools. Parameters ---------- @@ -279,14 +296,17 @@ def process_file(fq_file, path, bowtie2_params, sample_size=None): path : str Path to the Bowtie2 library and index files. bowtie2_params : str - Additional parameters for Bowtie2 alignment (e.g., "--fast --threads 4"). + Additional parameters for Bowtie2 alignment + (e.g., "--fast --threads 4"). sample_size : int, optional - Number of reads to randomly sample. If None, the entire file is processed. + Number of reads to randomly sample. + If None, the entire file is processed. Returns ------- dict or None - A dictionary with processing results, including file name and metrics. Returns None if an error + A dictionary with processing results, including file name and metrics. + Returns None if an error occurs during processing. """ @@ -331,8 +351,9 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): """ Executes the full workflow for processing sequencing data. - The workflow includes directory traversal, conversion of BIOM files to FASTQ format (if needed), and - parallelized processing of FASTQ files. Results are aggregated and optionally written to a CSV file. + The workflow includes directory traversal, conversion of BIOM files to + FASTQ format (if needed), and parallelized processing of FASTQ files. + Results are aggregated and optionally written to a CSV file. Parameters ---------- @@ -343,7 +364,8 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params): write_csv : bool If True, writes the results to a CSV file. sample_size : int - Number of reads to sample from each FASTQ file. If None, all reads are processed. + Number of reads to sample from each FASTQ file. + If None, all reads are processed. bowtie2_params : str Additional parameters for Bowtie2 alignment. @@ -382,8 +404,9 @@ def main(): """ Parses user input and initiates the workflow. - This function uses argparse to handle command-line arguments, passing them to the workflow function. - It provides flexibility for customizing the input directory, Bowtie2 parameters, and output options. 
+ This function uses argparse to handle command-line arguments, passing them + to the workflow function. It provides flexibility for customizing the input + directory, Bowtie2 parameters, and output options. Parameters ---------- @@ -396,21 +419,30 @@ def main(): Notes ----- - - Command-line usage should follow the format specified in the argparse help message. - - Ensure all required dependencies (Bowtie2, Bedtools) are installed and accessible. + - Command-line usage should follow the format specified in + the argparse help message. + - Ensure all required dependencies (Bowtie2, Bedtools) are + installed and accessible. """ parser = argparse.ArgumentParser(prog='VX detector', description=( - 'This program tries to find which variable region of the 16S sequence was sequenced')) + 'This program tries to find which variable region of the 16S \ + sequence was sequenced')) parser.add_argument('dir_path', help=( - 'Directory path of the directory containing multiple fastq or fasta files.')) - parser.add_argument('-o', '--output', dest='output_file', default=sys.stdout, - help='User can specify a file format in which the output is written in the Output folder.') + 'Directory path of the directory containing \ + multiple fastq or fasta files.')) + parser.add_argument('-o', '--output', dest='output_file', + default=sys.stdout, + help='User can specify a file format in which the \ + output is written in the Output folder.') parser.add_argument('-c', '--csv', dest='write_csv', action='store_true', - help='If set the output will be written in a .csv file in the Output folder') - parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, default=1000, + help='If set the output will be written in a .csv \ + file in the Output folder') + parser.add_argument('-s', '--sample_size', dest='sample_size', type=int, + default=1000, help='Number of reads to sample from each FASTQ file') - parser.add_argument('-b', '--bowtie2_params', dest='bowtie2_params', default=None, + parser.add_argument('-b', '--bowtie2_params', dest='bowtie2_params', + default=None, help='Additional parameters to pass to Bowtie2') # Parse the user input arguments args = parser.parse_args() From 3cf6e0fb3f56383da27259eaf9bb9ac702e3b6a0 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 28 Nov 2024 13:44:07 +0100 Subject: [PATCH 13/37] formatted for flake8 --- vxdetector/VXdetector.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 8866a32..7c0f9d0 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -156,7 +156,7 @@ def save_sampled_fastq(sampled_reads, output_path): The function accepts a list of sampled reads in FASTQ format and saves them to the specified output file. If the output file ends - with '.gz', the file is automatically compressed using gzip. + with '.gz', the file is automatically compressed using gzip. This function ensures that the sampled reads are stored efficiently for use in subsequent steps like alignment or analysis. 
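
Reviewer note: the workflow hunk below only re-wraps the Pool.starmap call for flake8; the fan-out pattern itself is unchanged: build one argument tuple per forward-read file, skip the '_R2_' mates (process_file locates them from the '_R1_' name), and merge the per-file result dicts while dropping files that returned None. A minimal runnable sketch of that pattern, where the worker body, library path, and file names are placeholders rather than the real process_file:

    from multiprocessing import Pool

    def process_file(fq_file, path, bowtie2_params, sample_size):
        # stand-in for the real per-file pipeline
        return {fq_file: {'Number of Reads': sample_size}}

    if __name__ == '__main__':
        fastq_files = ['a_R1_.fastq', 'a_R2_.fastq', 'b_R1_.fastq']
        jobs = [(fq, '/path/to/lib/', None, 100)
                for fq in fastq_files if '_R2_' not in fq]
        with Pool() as pool:
            results = pool.starmap(process_file, jobs)
        result = {}
        for res in results:
            if res:  # skip files that failed and returned None
                result.update(res)
        print(result)
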
@@ -392,8 +392,10 @@ def workflow(file_dir, new_file, write_csv, sample_size, bowtie2_params):
     elif os.path.isdir(file_dir):  # Process directory of FASTQ files
         fastq_files = glob.glob(f'{file_dir}/**/*.fastq*', recursive=True)
         with Pool() as pool:
-            results = pool.starmap(process_file, [(fq_file, path, bowtie2_params, sample_size)
-                                   for fq_file in fastq_files if '_R2_' not in fq_file])
+            results = pool.starmap(process_file, [(fq_file, path,
+                                                   bowtie2_params, sample_size)
+                                                  for fq_file in fastq_files
+                                                  if '_R2_' not in fq_file])
             for res in results:
                 if res:
                     result.update(res)

From 3849a7d0bc5ce0dfe07f234609229fe094839184 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Thu, 12 Dec 2024 16:29:53 +0100
Subject: [PATCH 14/37] Delete .DS_Store

---
 .DS_Store | Bin 8196 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 6b56441b7de3097435e94e155910f8b2d2e61d93..0000000000000000000000000000000000000000
Binary files a/.DS_Store and /dev/null differ

From dd79888c89ef1ccb87cb1d302c6a6c683ac2d64d Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Thu, 12 Dec 2024 16:31:35 +0100
Subject: [PATCH 15/37] Update .gitignore ignore DS_Store

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index f133658..d1dcd15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .coverage
 coverage.lcov
 bowtie2.*.bt2
+.DS_Store

From ceaa08e9b5ebe0b7bee576c37a73afd00a351676 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Thu, 12 Dec 2024 16:34:02 +0100
Subject: [PATCH 16/37] Delete vxdetector/testlog

---
 vxdetector/testlog | 9 ---------
 1 file changed, 9 deletions(-)
 delete mode 100644 vxdetector/testlog

diff --git a/vxdetector/testlog b/vxdetector/testlog
deleted file mode 100644
index 6e6ca14..0000000
--- a/vxdetector/testlog
+++ /dev/null
@@ -1,9 +0,0 @@
-sh: /bin/bowtie2-build: No such file or directory
-Traceback (most recent call last):
-  File "/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 144, in <module>
-    main()
-  File "/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 141, in main
-    workflow(args.dir_path, args.output_file, args.write_csv, args.sample_size)
-  File "/Users/jeremy/Desktop/VX/algorithm_vxdetector/vxdetector/VXdetector.py", line 77, in workflow
-    raise ValueError('There were no FASTQ files in this directory')
-ValueError: There were no FASTQ files in this directory

From 844c824fc5e8455fd15c8feca7246a913b565d3c Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Thu, 12 Dec 2024 16:40:35 +0100
Subject: [PATCH 17/37] Update environment.yml

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 
7358ce6..8b4c374 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - flake8 - - nose + - pytest - coverage >= 6 # to ensure lcov option is available - pandas - samtools From 8867fa87906d1fdf832816d2050c7e319acd03d4 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 12 Dec 2024 16:45:48 +0100 Subject: [PATCH 18/37] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 8b4c374..a597d56 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - flake8 - - pytest + - pynose - coverage >= 6 # to ensure lcov option is available - pandas - samtools From 2b2a01aab7acbfdc61fabcec1205a10475d2905c Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Thu, 12 Dec 2024 16:47:50 +0100 Subject: [PATCH 19/37] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index a597d56..7358ce6 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - flake8 - - pynose + - nose - coverage >= 6 # to ensure lcov option is available - pandas - samtools From 37f860122c8c19303db58515bb143a413de101fb Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Sat, 18 Jan 2025 22:37:10 +0100 Subject: [PATCH 20/37] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 7358ce6..a597d56 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: - defaults dependencies: - flake8 - - nose + - pynose - coverage >= 6 # to ensure lcov option is available - pandas - samtools From 85b5d96f65d5b5251b2542c6a0ca81ddbec89ee0 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Sat, 18 Jan 2025 22:47:21 +0100 Subject: [PATCH 21/37] Update environment.yml --- environment.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index a597d56..b4cb5d0 100644 --- a/environment.yml +++ b/environment.yml @@ -5,9 +5,12 @@ channels: - defaults dependencies: - flake8 - - pynose - - coverage >= 6 # to ensure lcov option is available - pandas - samtools - bedtools - bowtie2 + - coverage >= 6 # to ensure lcov option is available + - pip # Add pip to handle packages not available in conda + - pip: + - pynose # Install pynose via pip, if available + From 9f15b023d479d4877156623c8b1c73035c3bf322 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Sat, 18 Jan 2025 23:00:24 +0100 Subject: [PATCH 22/37] Update VXdetector.py --- vxdetector/VXdetector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py index 7c0f9d0..a8e62e8 100644 --- a/vxdetector/VXdetector.py +++ b/vxdetector/VXdetector.py @@ -6,7 +6,7 @@ import sys import warnings import pandas as pd -import Output_counter as Output_counter +import Output_counter import files_manager as files_manager from interact_bowtie2 import mapbowtie2, buildbowtie2 from interact_bedtools import overlap From 015bbf5e9659d02f581845cb1034994b89f3dc5c Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Sat, 18 Jan 2025 23:12:53 +0100 Subject: [PATCH 23/37] Update test_VXdetector.py --- vxdetector/tests/test_VXdetector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py index a260e47..dbf6981 100644 
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -34,7 +34,7 @@ def tearDown(self):
         shutil.rmtree(self.fp_tmpdir)
 
     def test_csv_output(self):
-        new_file = f'{self.fp_tmpdir}test.csv'
+        new_file = f'{self.fp_tmpdir}/test.csv'
         single_file = True
         vx.do_output(self.result, new_file, single_file)
         content = []
@@ -63,7 +63,7 @@ def test_numeric_conversion(self):
             'V6': 0.0122571189279732, }}
         single_file = True
-        new_file = f'{self.fp_tmpdir}test3.csv'
+        new_file = f'{self.fp_tmpdir}/test3.csv'
         vx.do_output(mixed_result, new_file, single_file)
         content = []
         with open(self.output_test)as f:
@@ -90,7 +90,7 @@ def test_output_options(self):
         vx.do_output(self.result, new_file, single_file)
         sys.stdout = sys.__stdout__
         self.assertEqual(capturedOutput.getvalue(), expected)
-        new_file = f'{self.fp_tmpdir}test2.csv'
+        new_file = f'{self.fp_tmpdir}/test2.csv'
         vx.do_output(self.result, new_file, single_file)
         self.assertTrue(os.path.exists(new_file))

From 528ab57672a60c804c562db54b7ee31b0dde919b Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Sat, 18 Jan 2025 23:20:09 +0100
Subject: [PATCH 24/37] Update test_VXdetector.py

---
 vxdetector/tests/test_VXdetector.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index dbf6981..a260e47 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -34,7 +34,7 @@ def tearDown(self):
         shutil.rmtree(self.fp_tmpdir)
 
     def test_csv_output(self):
-        new_file = f'{self.fp_tmpdir}/test.csv'
+        new_file = f'{self.fp_tmpdir}test.csv'
         single_file = True
         vx.do_output(self.result, new_file, single_file)
         content = []
@@ -63,7 +63,7 @@ def test_numeric_conversion(self):
             'V6': 0.0122571189279732, }}
         single_file = True
-        new_file = f'{self.fp_tmpdir}/test3.csv'
+        new_file = f'{self.fp_tmpdir}test3.csv'
        vx.do_output(mixed_result, new_file, single_file)
         content = []
         with open(self.output_test)as f:
@@ -90,7 +90,7 @@ def test_output_options(self):
         vx.do_output(self.result, new_file, single_file)
         sys.stdout = sys.__stdout__
         self.assertEqual(capturedOutput.getvalue(), expected)
-        new_file = f'{self.fp_tmpdir}/test2.csv'
+        new_file = f'{self.fp_tmpdir}test2.csv'
         vx.do_output(self.result, new_file, single_file)
         self.assertTrue(os.path.exists(new_file))

From d2833b7181ddd266bc21f777e51bb1edd814b3e8 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Fri, 31 Jan 2025 16:13:06 +0100
Subject: [PATCH 25/37] cleanup

---
 Output/.DS_Store           | Bin 6148 -> 0 bytes
 environment.yml            | 15 +++++++++++++++
 vxdetector/.DS_Store       | Bin 6148 -> 0 bytes
 vxdetector/tests/.DS_Store | Bin 6148 -> 0 bytes
 4 files changed, 15 insertions(+)
 delete mode 100644 Output/.DS_Store
 create mode 100644 environment.yml
 delete mode 100644 vxdetector/.DS_Store
 delete mode 100644 vxdetector/tests/.DS_Store

diff --git a/Output/.DS_Store b/Output/.DS_Store
deleted file mode 100644
index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000
Binary files a/Output/.DS_Store and /dev/null differ

diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..3388e23
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,15 @@
[the first lines of the new environment.yml were lost in extraction; the surviving tail follows]
+  - coverage >= 6 # to ensure lcov option is available
+  - pip # Add pip to handle packages not available in conda
+  - pip:
+    - pynose # Install pynose via pip, if available
\ No newline at end of file

diff --git a/vxdetector/.DS_Store b/vxdetector/.DS_Store
deleted file mode 100644
index b731e8a11f420afc3e722ba25df51d9e5f08f722..0000000000000000000000000000000000000000
Binary files a/vxdetector/.DS_Store and /dev/null differ

diff --git a/vxdetector/tests/.DS_Store b/vxdetector/tests/.DS_Store
deleted file mode 100644
Binary files a/vxdetector/tests/.DS_Store and /dev/null differ

From 7d60e26afc4435b4fa7f0f19516dbd69b6431efe Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Fri, 31 Jan 2025 16:23:55 +0100
Subject: [PATCH 26/37] TestChanges

---
 vxdetector/VXdetector.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vxdetector/VXdetector.py b/vxdetector/VXdetector.py
index a8e62e8..3f79bd2 100644
--- a/vxdetector/VXdetector.py
+++ b/vxdetector/VXdetector.py
@@ -6,10 +6,10 @@ import sys
 import warnings
 import pandas as pd
-import Output_counter
-import files_manager as files_manager
-from interact_bowtie2 import mapbowtie2, buildbowtie2
-from interact_bedtools import overlap
+import vxdetector.Output_counter as Output_counter
+import vxdetector.files_manager as files_manager
+from vxdetector.interact_bowtie2 import mapbowtie2, buildbowtie2
+from vxdetector.interact_bedtools import overlap
 import tempfile
 from multiprocessing import Pool
 import biom

From 10398e93cda8cb00dbbdc23dc88c32d97064a819 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Fri, 31 Jan 2025 16:37:46 +0100
Subject: [PATCH 27/37] added biom to env

---
 environment.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 3388e23..d21489e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,4 +12,5 @@ dependencies:
   - coverage >= 6 # to ensure lcov option is available
   - pip # Add pip to handle packages not available in conda
   - pip:
-    - pynose # Install pynose via pip, if available
\ No newline at end of file
+    - pynose # Install pynose via pip, if available
+    - biom
\ No newline at end of file

From 4ab20f703009f07e5993003c30dc36b623a3c1fe Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Fri, 31 Jan 2025 16:39:54 +0100
Subject: [PATCH 28/37] -format

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index d21489e..0815b9e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -13,4 +13,4 @@ dependencies:
   - pip # Add pip to handle packages not available in conda
   - pip:
     - pynose # Install pynose via pip, if available
-    - biom
\ No newline at end of file
+    - biom-format
\ No newline at end of file

From 0edb2894a7263855dd619281cf4104de3bdce53f Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Mon, 3 Feb 2025 13:48:29 +0100
Subject: [PATCH 29/37] Changed test to include new params

---
 vxdetector/tests/test_VXdetector.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index a260e47..d68e4a3 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -151,7 +151,7 @@ def test_singleFile(self):
        expected = 
f'{self.path}test_data/Output_test.csv'
         actual = f'{self.fp_tmpdir}/singleFile_test.csv'
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, False)
+        vx.workflow(test_file, actual, False, 1000, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -166,7 +166,7 @@ def test_directory(self):
         expected = f'{self.path}test_data/dir_test.csv'
         actual = f'{self.path}/test_data/dir_test_actual.csv'
         test_file = f'{self.path}test_data/test_dir/'
-        vx.workflow(test_file, actual, False)
+        vx.workflow(test_file, actual, False, 1000, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -181,7 +181,7 @@ def test_c_option(self):
         expected = f'{self.path}test_data/Output_test.csv'
         actual = sys.stdout
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, True)
+        vx.workflow(test_file, actual, False, 1000, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -194,25 +194,25 @@ def test_c_option(self):
 
     def test_no_fastq(self):
         with self.assertRaises(ValueError) as cm:
-            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout, False)
+            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout, False, 1000, None)
         self.assertEqual('There were no FASTQ files in this directory',
                          str(cm.exception))
 
     def test_Error(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/Output_test.csv',
-                        sys.stdout, False)
+                        sys.stdout, False, 1000, None)
         self.assertEqual('This file does not look like a fastq file',
                          str(cm.exception))
 
     def test_raise(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_test.fastq',
-                        sys.stdout, False)
+                        sys.stdout, False, 1000, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_'
-                        'paired_R1_001.fastq', sys.stdout, False)
+                        'paired_R1_001.fastq', sys.stdout, False, 1000, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))

From bf931b651b8bcc235f80e6fc73e64be178554407 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Mon, 3 Feb 2025 13:51:20 +0100
Subject: [PATCH 30/37] Formatting for Lint

---
 vxdetector/tests/test_VXdetector.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index d68e4a3..547e5d8 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -194,7 +194,8 @@ def test_c_option(self):
 
     def test_no_fastq(self):
         with self.assertRaises(ValueError) as cm:
-            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout, False, 1000, None)
+            vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout,
+                        False, 1000, None)
         self.assertEqual('There were no FASTQ files in this directory',
                          str(cm.exception))
 
From b910cef7383a2022ab0131d7b2ccacfde97cc734 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Mon, 3 Feb 2025 14:43:44 +0100
Subject: [PATCH 31/37] Changed the interact_bowtie2 Script

---
 vxdetector/interact_bowtie2.py | 144 ++++++++++++---------------
 1 file changed, 52 insertions(+), 92 deletions(-)

diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py
index 912657a..2438cee 100644
--- a/vxdetector/interact_bowtie2.py
+++ b/vxdetector/interact_bowtie2.py
@@ -2,7 +2,9 @@
 
 import os
 import shutil
+import subprocess
 
+# Check for the installation paths of 
Bowtie2, Samtools, and Bedtools bowtie2_path = shutil.which('bowtie2') if bowtie2_path is None: bowtie2_path = '$CONDA/bin/bowtie2' @@ -12,109 +14,67 @@ bedtools_path = shutil.which('bedtools') if bedtools_path is None: bedtools_path = '$CONDA/bin/bedtools' -# finds where bowtie2, samtools and bedtools -# are installed - def buildbowtie2(path): - r'''Builds bowtie2 index + """Builds Bowtie2 index for the reference genome.""" + input_ref = f'{path}Indexed_bt2/85_otus.fasta' # Path to the reference genome + output_path = f'{path}Indexed_bt2/bowtie2' # Output path for the index + # Check if the index already exists; if not, build it + if not os.path.exists(f'{output_path}.1.bt2'): + os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null') - This function builds an index for bowtie2 it is the equivalent - of building it manually with the "" command. +def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None): + """Maps reads against a previously built Bowtie2 index.""" + + index_path = f'{path}Indexed_bt2/bowtie2' # Path to the Bowtie2 index + # Check if the index files exist + if not os.path.exists(f'{index_path}.1.bt2'): + raise FileNotFoundError(f'No index files found under "{index_path}"') - Parameters - ---------- - path : str - The program filepath. The directory where it needs to look - for the reference genome and where the index should be saved. + log_path = f'{temp_path}bowtie2.log' # Log file for Bowtie2 + bed_logpath = f'{temp_path}bed.log' # Log file for Bedtools + Error = False - ''' - input_ref = f'{path}Indexed_bt2/85_otus.fasta' - # reference genome greengenes is used - output_path = f'{path}Indexed_bt2/bowtie2' - if os.path.exists(f'{output_path}.1.bt2'): - # only builds an index for the alignment if there isn't already one - pass - else: - os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} ' - '> /dev/null') - # if a indexfile is missing a new index is build + # Base command for Bowtie2 + bowtie2_base_cmd = [bowtie2_path, '-x', index_path, '--fast'] + # Add additional Bowtie2 parameters if provided + if bowtie2_params: + bowtie2_base_cmd += bowtie2_params.split() -def mapbowtie2(fasta_file, read2_file, path, temp_path, paired): - r'''Maps reads against index + # Handle paired-end reads + if paired: + aligned_path = f'{temp_path}paired.bed' # Output path for paired reads + bowtie2_base_cmd += ['-1', fasta_file, '-2', read2_file] + + # Commands for Samtools and Bedtools + cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4'] + cmd_bedtools = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin'] - This function maps read files against a previously build index. - The bowtie standard output .sam is piped into samtools view and - converted to .bam. In this step all unmapped reads and those with a - lower mapquality than requested are discarded. - If the reads are paired the .bam file is converted to .bed with bedtools - bamtobed that way they are properly paired and can be intersected. 
+ # Run the Bowtie2, Samtools, and Bedtools pipeline + with open(log_path, 'w') as log_file, open(bed_logpath, 'w') as bed_log, open(aligned_path, 'w') as output_file: + bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE) + samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE) + subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file, stderr=bed_log) - Parameters - ---------- - fasta_file : str - Path to a .fastq file (or .fastq.gz). This file contains the (forward) - reads which are mapped against the reference. - read2_file : str - Path to a .fastq file (or .fastq.gz) containing backwards reads. - Is disregarded if paired = False. If not needed can be ''. - path : str - The program filepath. The directory where it needs to look - for the index files. - temp_path : str - Path to a (temporary) folder where the resulting .log files and output - files are saved - paired : Bool - Dictates wether bowtie2 alignes paired or unpaired reads. - If paired is set to False bowtie2 will do an unpaired alignment - otherwise it will do a paired one. + # Handle unpaired reads + else: + aligned_path = f'{temp_path}unpaired.bed' # Output path for unpaired reads + bowtie2_base_cmd += ['-U', fasta_file] + + # Commands for Samtools and Bedtools + cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4'] + cmd_bedtools = [bedtools_path, 'bamtobed', '-i', 'stdin'] - Returns - ------- - aligned_path : str - Path to the converted bowtie2 Output file in which either paired or - unpaired reads have been mapped against the reference. - Error : Bool - Wether or not bowtie2 ended with an error. - In the context of the whole program Error = True leads to either - a raised Exception (if only a single file was given) or to the current - file being skipped (if a directory was given). + # Run the Bowtie2 and Samtools pipeline + with open(log_path, 'w') as log_file, open(aligned_path, 'w') as output_file: + bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE) + samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE) + subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file) - ''' - index_path = f'{path}Indexed_bt2/bowtie2' - if os.path.exists(f'{index_path}.1.bt2') is False: - raise FileNotFoundError(f'No Index files found under "{index_path}"') - # raises an Exception if the Index files cannot be found - log_path = f'{temp_path}bowtie2.log' - bed_logpath = f'{temp_path}bed.log' - # declares various filepaths - Error = False - if paired is True: - aligned_path = f'{temp_path}paired.bed' - cmd = f'{bowtie2_path} -x {index_path} -1 {fasta_file} \ - -2 {read2_file} --fast 2> {log_path} | \ - {samtools_path} view -b -q 30 -S -F 4 \ - | {bedtools_path} bamtobed -bedpe -i stdin > {aligned_path} \ - 2> {bed_logpath}' - # Should a backward read be found both files will be given to bowtie2. 
- # After converting .sam to .bam a conversion to .bed is done to - # properly mate the pairs - else: - aligned_path = f'{temp_path}unpaired.bam' - cmd = f'{bowtie2_path} -x {index_path} -q -U {fasta_file} \ - --fast 2> {log_path} | {samtools_path} view -b -q 30 -S -F 4 \ - -o {aligned_path}' - # Should no backward read be found it will just use the forward - # read and does an alignment followed by a pipe to convert - # the bowtie2 output .sam to a .bam file - os.system(cmd) + # Check for errors in the Bowtie2 log file with open(log_path, 'r') as log: - lines = log.readlines() - try: - int(lines[0].split()[0]) - except ValueError: + if any("error" in line.lower() for line in log): Error = True - # Checks if bowtie2 exited with an error - return aligned_path, Error + return aligned_path, Error # Return the path to the aligned file and error status From 99018bc14585541f0e26c266f7ea187392c55be6 Mon Sep 17 00:00:00 2001 From: Landbarsch Date: Tue, 4 Feb 2025 09:46:35 +0100 Subject: [PATCH 32/37] formatting of interact_bowtie2 --- vxdetector/interact_bowtie2.py | 55 +++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py index 2438cee..595e7c6 100644 --- a/vxdetector/interact_bowtie2.py +++ b/vxdetector/interact_bowtie2.py @@ -15,17 +15,21 @@ if bedtools_path is None: bedtools_path = '$CONDA/bin/bedtools' + def buildbowtie2(path): """Builds Bowtie2 index for the reference genome.""" - input_ref = f'{path}Indexed_bt2/85_otus.fasta' # Path to the reference genome + input_ref = f'{path}Indexed_bt2/85_otus.fasta' # Path to reference genome output_path = f'{path}Indexed_bt2/bowtie2' # Output path for the index # Check if the index already exists; if not, build it if not os.path.exists(f'{output_path}.1.bt2'): - os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null') + os.system( + f'{bowtie2_path}-build -f {input_ref}{output_path} > /dev/null') + -def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None): +def mapbowtie2( + fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None): """Maps reads against a previously built Bowtie2 index.""" - + index_path = f'{path}Indexed_bt2/bowtie2' # Path to the Bowtie2 index # Check if the index files exist if not os.path.exists(f'{index_path}.1.bt2'): @@ -46,35 +50,44 @@ def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=N if paired: aligned_path = f'{temp_path}paired.bed' # Output path for paired reads bowtie2_base_cmd += ['-1', fasta_file, '-2', read2_file] - + # Commands for Samtools and Bedtools - cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4'] - cmd_bedtools = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin'] + cmd_sam = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4'] + cmd_bed = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin'] # Run the Bowtie2, Samtools, and Bedtools pipeline - with open(log_path, 'w') as log_file, open(bed_logpath, 'w') as bed_log, open(aligned_path, 'w') as output_file: - bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE) - samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE) - subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file, stderr=bed_log) + with open(log_path, 'w') as log_file, \ + open(bed_logpath, 'w') as bed_log, \ + open(aligned_path, 'w') as output_file: + bowtie_process = 
From 99018bc14585541f0e26c266f7ea187392c55be6 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 09:46:35 +0100
Subject: [PATCH 32/37] formatting of interact_bowtie2

---
 vxdetector/interact_bowtie2.py | 55 +++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py
index 2438cee..595e7c6 100644
--- a/vxdetector/interact_bowtie2.py
+++ b/vxdetector/interact_bowtie2.py
@@ -15,17 +15,21 @@
 if bedtools_path is None:
     bedtools_path = '$CONDA/bin/bedtools'
 
+
 def buildbowtie2(path):
     """Builds Bowtie2 index for the reference genome."""
-    input_ref = f'{path}Indexed_bt2/85_otus.fasta'  # Path to the reference genome
+    input_ref = f'{path}Indexed_bt2/85_otus.fasta'  # Path to reference genome
     output_path = f'{path}Indexed_bt2/bowtie2'  # Output path for the index
     # Check if the index already exists; if not, build it
     if not os.path.exists(f'{output_path}.1.bt2'):
-        os.system(f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null')
+        os.system(
+            f'{bowtie2_path}-build -f {input_ref}{output_path} > /dev/null')
+
 
-def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None):
+def mapbowtie2(
+        fasta_file, read2_file, path, temp_path, paired, bowtie2_params=None):
     """Maps reads against a previously built Bowtie2 index."""
-    
+
     index_path = f'{path}Indexed_bt2/bowtie2'  # Path to the Bowtie2 index
     # Check if the index files exist
     if not os.path.exists(f'{index_path}.1.bt2'):
@@ -46,35 +50,44 @@ def mapbowtie2(fasta_file, read2_file, path, temp_path, paired, bowtie2_params=N
     if paired:
         aligned_path = f'{temp_path}paired.bed'  # Output path for paired reads
         bowtie2_base_cmd += ['-1', fasta_file, '-2', read2_file]
-        
+
         # Commands for Samtools and Bedtools
-        cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
-        cmd_bedtools = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin']
+        cmd_sam = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
+        cmd_bed = [bedtools_path, 'bamtobed', '-bedpe', '-i', 'stdin']
 
         # Run the Bowtie2, Samtools, and Bedtools pipeline
-        with open(log_path, 'w') as log_file, open(bed_logpath, 'w') as bed_log, open(aligned_path, 'w') as output_file:
-            bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
-            samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
-            subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file, stderr=bed_log)
+        with open(log_path, 'w') as log_file, \
+                open(bed_logpath, 'w') as bed_log, \
+                open(aligned_path, 'w') as output_file:
+            bowtie_process = subprocess.Popen(
+                bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
+            samtools_process = subprocess.Popen(
+                cmd_sam, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
+            subprocess.run(cmd_bed, stdin=samtools_process.stdout,
+                           stdout=output_file, stderr=bed_log)
 
     # Handle unpaired reads
     else:
-        aligned_path = f'{temp_path}unpaired.bed'  # Output path for unpaired reads
+        # Output path for unpaired reads
+        aligned_path = f'{temp_path}unpaired.bed'
         bowtie2_base_cmd += ['-U', fasta_file]
-        
+
        # Commands for Samtools and Bedtools
-        cmd_samtools = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
-        cmd_bedtools = [bedtools_path, 'bamtobed', '-i', 'stdin']
+        cmd_sam = [samtools_path, 'view', '-b', '-q', '30', '-S', '-F', '4']
+        cmd_bed = [bedtools_path, 'bamtobed', '-i', 'stdin']
 
         # Run the Bowtie2 and Samtools pipeline
-        with open(log_path, 'w') as log_file, open(aligned_path, 'w') as output_file:
-            bowtie_process = subprocess.Popen(bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
-            samtools_process = subprocess.Popen(cmd_samtools, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
-            subprocess.run(cmd_bedtools, stdin=samtools_process.stdout, stdout=output_file)
+        with open(log_path, 'w') as log_f, open(aligned_path, 'w') as output_f:
+            bowtie_process = subprocess.Popen(
+                bowtie2_base_cmd, stderr=log_f, stdout=subprocess.PIPE)
+            samtools_process = subprocess.Popen(
+                cmd_sam, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
+            subprocess.run(
+                cmd_bed, stdin=samtools_process.stdout, stdout=output_f)
 
     # Check for errors in the Bowtie2 log file
     with open(log_path, 'r') as log:
         if any("error" in line.lower() for line in log):
             Error = True
-    
-    return aligned_path, Error  # Return the path to the aligned file and error status
+    # Return the path to the aligned file and error status
+    return aligned_path, Error
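Reviewer note: patch 32's central pattern is the three subprocess stages connected through pipes, replacing a shell pipeline built as one f-string, so no shell ever parses the file paths. A self-contained sketch of the same pattern with hypothetical paths; the explicit .stdout.close() calls and the final wait() are a common hardening step that the patch itself does not add:

    import subprocess

    with open('bowtie2.log', 'w') as log, open('unpaired.bed', 'w') as out:
        bowtie = subprocess.Popen(
            ['bowtie2', '-x', 'Indexed_bt2/bowtie2', '-q',
             '-U', 'reads.fastq', '--fast'],
            stdout=subprocess.PIPE, stderr=log)
        samtools = subprocess.Popen(
            ['samtools', 'view', '-b', '-q', '30', '-S', '-F', '4'],
            stdin=bowtie.stdout, stdout=subprocess.PIPE)
        bowtie.stdout.close()   # let bowtie2 get SIGPIPE if samtools exits early
        subprocess.run(['bedtools', 'bamtobed', '-i', 'stdin'],
                       stdin=samtools.stdout, stdout=out)
        samtools.stdout.close()
        bowtie.wait()           # reap bowtie2 and collect its exit status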
From cc4fadc681c010580abccf5e7c77ccfbb3c7a524 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 11:35:56 +0100
Subject: [PATCH 33/37] fixes for testing

---
 vxdetector/interact_bowtie2.py           |  9 +++++----
 vxdetector/tests/test_interact_bowtie.py | 20 +++++++++++++-------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/vxdetector/interact_bowtie2.py b/vxdetector/interact_bowtie2.py
index 595e7c6..0bd10a8 100644
--- a/vxdetector/interact_bowtie2.py
+++ b/vxdetector/interact_bowtie2.py
@@ -23,7 +23,7 @@ def buildbowtie2(path):
     # Check if the index already exists; if not, build it
     if not os.path.exists(f'{output_path}.1.bt2'):
         os.system(
-            f'{bowtie2_path}-build -f {input_ref}{output_path} > /dev/null')
+            f'{bowtie2_path}-build -f {input_ref} {output_path} > /dev/null')
 
 
 def mapbowtie2(
@@ -77,13 +77,14 @@ def mapbowtie2(
         cmd_bed = [bedtools_path, 'bamtobed', '-i', 'stdin']
 
         # Run the Bowtie2 and Samtools pipeline
-        with open(log_path, 'w') as log_f, open(aligned_path, 'w') as output_f:
+        with open(log_path, 'w') as log_file, \
+                open(aligned_path, 'w') as output_file:
             bowtie_process = subprocess.Popen(
-                bowtie2_base_cmd, stderr=log_f, stdout=subprocess.PIPE)
+                bowtie2_base_cmd, stderr=log_file, stdout=subprocess.PIPE)
             samtools_process = subprocess.Popen(
                 cmd_sam, stdin=bowtie_process.stdout, stdout=subprocess.PIPE)
             subprocess.run(
-                cmd_bed, stdin=samtools_process.stdout, stdout=output_f)
+                cmd_bed, stdin=samtools_process.stdout, stdout=output_file)
 
     # Check for errors in the Bowtie2 log file
     with open(log_path, 'r') as log:
diff --git a/vxdetector/tests/test_interact_bowtie.py b/vxdetector/tests/test_interact_bowtie.py
index 860d1f0..429ace7 100644
--- a/vxdetector/tests/test_interact_bowtie.py
+++ b/vxdetector/tests/test_interact_bowtie.py
@@ -31,12 +31,14 @@ def test_aligned_path(self):
         paired = False
         temp_path = self.fp_tmpdir
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                                             path, temp_path, paired)
-        self.assertEqual(aligned_path, f'{temp_path}unpaired.bam')
+                                             path, temp_path, paired,
+                                             bowtie2_params="")
+        self.assertEqual(aligned_path, f'{temp_path}unpaired.bed')
         self.assertEqual(Error, False)
         paired = True
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                                             path, temp_path, paired)
+                                             path, temp_path, paired,
+                                             bowtie2_params="")
         self.assertEqual(aligned_path, f'{temp_path}paired.bed')
         self.assertEqual(Error, False)
 
@@ -52,7 +54,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         os.system(f'{samtools_path} view {temp_path}unpaired.bam '
                   f'> {temp_path}unpaired.sam')
         output = []
@@ -66,7 +69,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         output = []
         with open(f'{temp_path}paired.bed')as f:
             for line in f:
@@ -78,12 +82,14 @@ def test_wrong_file_type(self):
         read2_file = f'{path}test_data/paired/BED.bed'
         paired = True
         aligned_path, Error = ibo.mapbowtie2(self.fasta_file, read2_file,
-                                             path, self.fp_tmpdir, paired)
+                                             path, self.fp_tmpdir, paired,
+                                             bowtie2_params="")
         self.assertEqual(Error, True)
         paired = False
         fasta_file_local = f'{path}test_data/paired/BED.bed'
         aligned_path, Error = ibo.mapbowtie2(fasta_file_local, read2_file,
-                                             path, self.fp_tmpdir, paired)
+                                             path, self.fp_tmpdir, paired,
+                                             bowtie2_params="")
         self.assertEqual(Error, True)
 
     def test_no_index(self):
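Reviewer note: the one-character fix at the top of patch 33 (restoring the space between {input_ref} and {output_path} that patch 32's line wrap dropped) is the classic hazard of assembling shell commands from f-strings. Passing an argument list to subprocess makes that whole bug class impossible; a hedged sketch of the alternative, not what the module currently does (it keeps os.system here):

    import os
    import subprocess

    def build_index(path, bowtie2_build='bowtie2-build'):
        '''Build the Bowtie2 index unless it already exists.'''
        input_ref = os.path.join(path, 'Indexed_bt2', '85_otus.fasta')
        output_path = os.path.join(path, 'Indexed_bt2', 'bowtie2')
        if not os.path.exists(f'{output_path}.1.bt2'):
            # Every argument is its own list element; there is no string
            # concatenation where a separator could silently go missing.
            subprocess.run([bowtie2_build, '-f', input_ref, output_path],
                           stdout=subprocess.DEVNULL, check=True)

The rest of the patch threads bowtie2_params="" through the test calls to match the new mapbowtie2 signature and updates the expected unpaired output from .bam to .bed.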
From b2743c3f7e1816a4ca8173d320cc56f475bbe7c9 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 11:41:51 +0100
Subject: [PATCH 34/37] reduced sample size for the test

---
 vxdetector/tests/test_VXdetector.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index 547e5d8..ba17a58 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -151,7 +151,7 @@ def test_singleFile(self):
         expected = f'{self.path}test_data/Output_test.csv'
         actual = f'{self.fp_tmpdir}/singleFile_test.csv'
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, False, 1000, None)
+        vx.workflow(test_file, actual, False, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -166,7 +166,7 @@ def test_directory(self):
         expected = f'{self.path}test_data/dir_test.csv'
         actual = f'{self.path}/test_data/dir_test_actual.csv'
         test_file = f'{self.path}test_data/test_dir/'
-        vx.workflow(test_file, actual, False, 1000, None)
+        vx.workflow(test_file, actual, False, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -181,7 +181,7 @@ def test_c_option(self):
         expected = f'{self.path}test_data/Output_test.csv'
         actual = sys.stdout
         test_file = f'{self.path}test_data/5011_S225_L001_R1_001.fastq.gz'
-        vx.workflow(test_file, actual, False, 1000, None)
+        vx.workflow(test_file, actual, False, 100, None)
         content = []
         with open(expected)as f:
             for line in f:
@@ -195,21 +195,21 @@
     def test_no_fastq(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/Indexed_bt2', sys.stdout,
-                        False, 1000, None)
+                        False, 100, None)
         self.assertEqual('There were no FASTQ files in this directory',
                          str(cm.exception))
 
     def test_Error(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/Output_test.csv',
-                        sys.stdout, False, 1000, None)
+                        sys.stdout, False, 100, None)
         self.assertEqual('This file does not look like a fastq file',
                          str(cm.exception))
 
     def test_raise(self):
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_test.fastq',
-                        sys.stdout, False, 1000, None)
+                        sys.stdout, False, 100, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))
         with self.assertRaises(ValueError) as cm:

From 545c8c2df09dd4a2cae4e48639a4766ee7976854 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 12:39:09 +0100
Subject: [PATCH 35/37] changed a parameter

---
 vxdetector/tests/test_VXdetector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vxdetector/tests/test_VXdetector.py b/vxdetector/tests/test_VXdetector.py
index ba17a58..2c9c72b 100644
--- a/vxdetector/tests/test_VXdetector.py
+++ b/vxdetector/tests/test_VXdetector.py
@@ -214,6 +214,6 @@ def test_raise(self):
             'mapping-quality', str(cm.exception))
         with self.assertRaises(ValueError) as cm:
             vx.workflow(f'{self.path}test_data/test_dir/no_qual_'
-                        'paired_R1_001.fastq', sys.stdout, False, 1000, None)
+                        'paired_R1_001.fastq', sys.stdout, False, 100, None)
         self.assertEqual('This file has no Reads of the required '
                          'mapping-quality', str(cm.exception))

From e2d5f2ba31a0cecbda8765f9092473a81b41afba Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 13:16:38 +0100
Subject: [PATCH 36/37] Changes to test_interact_bowtie

---
 vxdetector/tests/test_interact_bowtie.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/vxdetector/tests/test_interact_bowtie.py b/vxdetector/tests/test_interact_bowtie.py
index 429ace7..2256178 100644
--- a/vxdetector/tests/test_interact_bowtie.py
+++ b/vxdetector/tests/test_interact_bowtie.py
@@ -54,8 +54,7 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired,
-                       bowtie2_params="")
+                       path, temp_path, paired)
         os.system(f'{samtools_path} view {temp_path}unpaired.bam '
                   f'> {temp_path}unpaired.sam')
         output = []
@@ -69,8 +68,7 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired,
-                       bowtie2_params="")
+                       path, temp_path, paired)
         output = []
         with open(f'{temp_path}paired.bed')as f:
             for line in f:
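Reviewer note: patches 34 and 35 reduce the subsample size passed to vx.workflow from 1000 to 100 reads across the test suite (patch 35 catches the one call patch 34 missed), and patch 36 strips the explicit bowtie2_params argument from test_pipe. The churn in these call sites suggests centralising the repeated call; a hypothetical helper for the test module, assuming the vx import the tests already use:

    import sys

    SAMPLE_SIZE = 100  # subsample size shared by every workflow test

    def run_workflow(test_file, output=sys.stdout, write_csv=False):
        # vx is the VXdetector module the tests already import; the
        # positional order mirrors the calls in the hunks above.
        return vx.workflow(test_file, output, write_csv, SAMPLE_SIZE, None)

With such a helper, a future sample-size or signature change touches one line instead of six call sites.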
From 7f6cfc8f61ab917bab2c24a029ff22cee4009aa2 Mon Sep 17 00:00:00 2001
From: Landbarsch
Date: Tue, 4 Feb 2025 13:22:32 +0100
Subject: [PATCH 37/37] undo changes

---
 vxdetector/tests/test_interact_bowtie.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vxdetector/tests/test_interact_bowtie.py b/vxdetector/tests/test_interact_bowtie.py
index 2256178..429ace7 100644
--- a/vxdetector/tests/test_interact_bowtie.py
+++ b/vxdetector/tests/test_interact_bowtie.py
@@ -54,7 +54,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         os.system(f'{samtools_path} view {temp_path}unpaired.bam '
                   f'> {temp_path}unpaired.sam')
         output = []
@@ -68,7 +69,8 @@ def test_pipe(self):
             for line in f:
                 content.append(line.strip().split())
         ibo.mapbowtie2(self.fasta_file, self.read2_file,
-                       path, temp_path, paired)
+                       path, temp_path, paired,
+                       bowtie2_params="")
         output = []
         with open(f'{temp_path}paired.bed')as f:
             for line in f:
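Reviewer note: patch 37 restores the bowtie2_params="" arguments that patch 36 had just removed from test_pipe, returning the tests to the state patch 33 established. Whether the empty string and the declared default of None are interchangeable depends on how mapbowtie2 consumes the parameter, which these hunks do not show; spelling the argument as a keyword at least makes each call site self-documenting, e.g.:

    # Keyword form of the same call: it is immediately visible that no
    # extra Bowtie2 flags are requested for this test run.
    ibo.mapbowtie2(self.fasta_file, self.read2_file,
                   path, temp_path, paired,
                   bowtie2_params="")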