Change the logging behavior

hzi-bifo · Dec 20, 2023 · 7bf7f14 · 7bf7f14
1 parent 496683e
commit 7bf7f14
Show file tree

Hide file tree

Showing 6 changed files with 84 additions and 35 deletions.
diff --git a/README.md b/README.md
@@ -85,12 +85,13 @@ optional arguments:
   -t THREADS, --threads THREADS
                         number of threads to use. (default: 10)
   -m MEMORY, --memory MEMORY
-                        amount (GB) of GPU RAM. (default: 12)
+                        Amount (GB) of GPU RAM. (default: 12)
   --chunk_size CHUNK_SIZE
                         Use this parameter when having low memory. Parsing the file in chunks.
                         Not needed when free RAM >=5 * your_file_size (uncompressed, sum of paired ends).
                         When chunk_size=256, memory=16 it will load 256 * 16 * 1024 reads each chunk (use ~20 GB for 100bp paired end).
-  -v, --version         show program's version number and exit
+  --log LOG             Log file name
+  -v, --version         Show program's version number and exit
 ```
 
 #### CPU mode
@@ -104,7 +105,7 @@ ribodetector_cpu -t 20 \
   --chunk_size 256 \
   -o outputs/reads.nonrrna.1.fq outputs/reads.nonrrna.2.fq
 ```
-The above command line excutes ribodetector for paired-end reads with mean length 100 using 20 CPU cores. The input reads do not need to be same length. RiboDetector supports reads with variable length. Setting `-l` to the mean read length is recommended. 
+The above command line executes ribodetector for paired-end reads with mean length 100 using 20 CPU cores. The input reads do not need to be the same length. RiboDetector supports reads with variable length. Setting `-l` to the mean read length is recommended. If you need to save the log to a file, you can specify it with `--log <logfile>`.
 
 Note: when using the **SLURM** job submission system, you need to set `--cpus-per-task` to the number of CPU cores you need and set `--threads-per-core` to 1.
 
@@ -143,7 +144,8 @@ optional arguments:
   --chunk_size CHUNK_SIZE
                         chunk_size * 1024 reads to load each time.
                         When chunk_size=1000 and threads=20, consuming ~20G memory, better to be multiples of the number of threads.
-  -v, --version         show program's version number and exit
+  --log LOG             Log file name
+  -v, --version         Show program's version number and exit
 ```
 
 **Note**: RiboDetector uses multiprocessing with shared memory, thus the memory use of a single process indicated in `htop` or `top` is actually the total memory used by RiboDetector. Some job submission systems like SGE miscalculate the total memory use by adding up the memory use of all processes. If you see this, do not worry that it will cause an out-of-memory issue.

diff --git a/ribodetector/__init__.py b/ribodetector/__init__.py
@@ -4,5 +4,5 @@
 Accurate and rapid RiboRNA sequences Detector based on deep learning.
 """
 
-__version__ = "0.2.7"
+__version__ = "0.2.9"
 __author__ = 'ZL Deng'
diff --git a/ribodetector/detect.py b/ribodetector/detect.py
@@ -207,15 +207,21 @@ def run(self):
 
                     # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels
 
+            self.logger.info('Processed {}{}{}{} sequences in total'.format(
+                        colors.BOLD,
+                        colors.OKCYAN,
+                        num_seqs,
+                        colors.ENDC))
+
             # Write predicted rRNA sequences if the rRNA output file is given
-            self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
+            self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_nonrrna,
                 colors.ENDC
             ))
 
-            self.logger.info('Detected {}{}{}{} rRNA sequences.'.format(
+            self.logger.info('Detected {}{}{}{} rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_rrna,
@@ -293,7 +299,13 @@ def run(self):
 
                     # del data, output, batch_labels
 
-            self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
+            self.logger.info('Processed {}{}{}{} sequences in total'.format(
+                        colors.BOLD,
+                        colors.OKCYAN,
+                        num_seqs,
+                        colors.ENDC))
+
+            self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_nonrrna,
@@ -393,14 +405,21 @@ def run_with_chunks(self):
                         colors.ENDC))
 
                     # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels
-            self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
+
+            self.logger.info('Processed {}{}{}{} sequences in total'.format(
+                        colors.BOLD,
+                        colors.OKCYAN,
+                        num_read,
+                        colors.ENDC))
+
+            self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_nonrrna,
                 colors.ENDC
             ))
 
-            self.logger.info('Detected {}{}{}{} rRNA sequences.'.format(
+            self.logger.info('Detected {}{}{}{} rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_rrna,
@@ -413,7 +432,7 @@ def run_with_chunks(self):
                 rrna2_fh.close()
 
             if self.args.ensure == 'both':
-                self.logger.info('Done! Detected {}{}{}{} rRNA sequences, discarded {}{}{}{} unclassified sequences.'.format(
+                self.logger.info('Discarded {}{}{}{} unclassified sequences'.format(
                     colors.BOLD,
                     colors.OKCYAN,
                     num_unknown,
@@ -477,6 +496,12 @@ def run_with_chunks(self):
                         num_read,
                         colors.ENDC))
 
+            self.logger.info('Processed {}{}{}{} sequences in total'.format(
+                        colors.BOLD,
+                        colors.OKCYAN,
+                        num_read,
+                        colors.ENDC))
+
             self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
@@ -757,15 +782,15 @@ def main():
       (Only applicable for paired end reads, discard the read pair when their predicitons are discordant)''')
 
     args.add_argument('-t', '--threads', default=10, type=int,
-                      help='number of threads to use. (default: 10)')
+                      help='Number of threads to use. (default: 10)')
     args.add_argument('-m', '--memory', default=32, type=int,
-                      help='amount (GB) of GPU RAM. (default: 12)')
+                      help='Amount (GB) of GPU RAM. (default: 12)')
     args.add_argument('--chunk_size', default=None, type=int,
                       help='Use this parameter when having low memory. Parsing the file in chunks.\n{}.\n{}.'.format(
                           'Not needed when free RAM >=5 * your_file_size (uncompressed, sum of paired ends)',
                           'When chunk_size=256, memory=16 it will load 256 * 16 * 1024 reads each chunk (use ~20 GB for 100bp paired end)'
                       ))
-    args.add_argument('--log', default='ribodetector.log', type=str,
+    args.add_argument('--log', default=None, type=str,
                       help='Log file name')
     args.add_argument('-v', '--version', action='version',
                       version='%(prog)s {version}'.format(version=__version__))

diff --git a/ribodetector/detect_cpu.py b/ribodetector/detect_cpu.py
@@ -206,15 +206,21 @@ def run(self):
                     num_unknown += len(r1_dict[-1])
 
                     # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels
+
+            self.logger.info('Processed {}{}{}{} sequences in total'.format(
+                        colors.BOLD,
+                        colors.OKCYAN,
+                        num_seqs,
+                        colors.ENDC))
 
-            self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
+            self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_nonrrna,
                 colors.ENDC
             ))
 
-            self.logger.info('Detected {}{}{}{} rRNA sequences.'.format(
+            self.logger.info('Detected {}{}{}{} rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_rrna,
@@ -226,7 +232,7 @@ def run(self):
                 rrna2_fh.close()
 
             if self.args.ensure == 'both':
-                self.logger.info('Discarded {}{}{}{} unclassified sequences.'.format(
+                self.logger.info('Discarded {}{}{}{} unclassified sequences'.format(
                     colors.BOLD,
                     colors.OKCYAN,
                     num_unknown,
@@ -301,6 +307,12 @@ def run(self):
                 if self.rrna is not None and r_dict[1]:
                     rrna_fh.write('\n'.join(r_dict[1]) + '\n')
 
+            self.logger.info('Processed {}{}{}{} sequences in total'.format(
+                        colors.BOLD,
+                        colors.OKCYAN,
+                        num_seqs,
+                        colors.ENDC))
+
             self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
@@ -333,6 +345,10 @@ def run_with_chunks(self):
         num_rrna = 0
 
         if self.is_paired:
+            self.logger.info('Classify paired-end reads with chunk size {}{}{}'.format(
+                colors.BOLD,
+                self.chunk_size,
+                colors.ENDC))
             if self.rrna is not None:
                 self.logger.info('Writing output rRNA sequences into file: {}{}{}'.format(
                     colors.OKBLUE,
@@ -416,20 +432,25 @@ def run_with_chunks(self):
                         # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels
                 num_read += len(chunk[0])
 
-                self.logger.info('{}{}{}{} reads classified!'.format(
-                    colors.BOLD,
-                    colors.OKCYAN,
+                self.logger.info('{}{}{}{} reads finished!'.format(
+                    colors.OKGREEN,
                     num_read,
                     colors.ENDC))
 
-            self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
+            self.logger.info('Processed {}{}{}{} sequences in total'.format(
+                        colors.BOLD,
+                        colors.OKCYAN,
+                        num_read,
+                        colors.ENDC))
+
+            self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_nonrrna,
                 colors.ENDC
             ))
 
-            self.logger.info('Detected {}{}{}{} rRNA sequences.'.format(
+            self.logger.info('Detected {}{}{}{} rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
                 num_rrna,
@@ -441,7 +462,7 @@ def run_with_chunks(self):
                 rrna2_fh.close()
 
             if self.args.ensure == 'both':
-                self.logger.info('Discarded {}{}{}{} unclassified sequences.'.format(
+                self.logger.info('Discarded {}{}{}{} unclassified sequences'.format(
                     colors.BOLD,
                     colors.OKCYAN,
                     num_unknown,
@@ -455,7 +476,7 @@ def run_with_chunks(self):
 
         else:
             # num_read = 0
-            self.logger.info('Classify paired end reads with chunk size {}{}{}'.format(
+            self.logger.info('Classify reads with chunk size {}{}{}'.format(
                 colors.BOLD,
                 self.chunk_size,
                 colors.ENDC))
@@ -514,12 +535,17 @@ def run_with_chunks(self):
 
                 num_read += len(chunk)
 
-                self.logger.info('{}{}{}{} reads classified!'.format(
-                    colors.BOLD,
-                    colors.OKCYAN,
+                self.logger.info('{}{}{} reads finished!'.format(
+                    colors.OKGREEN,
                     num_read,
                     colors.ENDC))
 
+            self.logger.info('Processed {}{}{}{} sequences in total'.format(
+                        colors.BOLD,
+                        colors.OKCYAN,
+                        num_read,
+                        colors.ENDC))
+
             self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
                 colors.BOLD,
                 colors.OKCYAN,
@@ -570,10 +596,6 @@ def detect(self):
         if self.chunk_size is None:
             self.run()
         else:
-            self.logger.info('Classify reads with chunk size {}{}{}'.format(
-                colors.BOLD,
-                self.chunk_size,
-                colors.ENDC))
             self.run_with_chunks()
 
     @staticmethod
@@ -771,12 +793,12 @@ def main():
       (Only applicable for paired end reads, discard the read pair when their predicitons are discordant)''')
 
     args.add_argument('-t', '--threads', default=20, type=int,
-                      help='number of threads to use. (default: 20)')
+                      help='Number of threads to use. (default: 20)')
 
     args.add_argument('--chunk_size', default=None, type=int,
                       help='chunk_size * 1024 reads to load each time. \n{}.'.format(
                           'When chunk_size=1000 and threads=20, consumming ~20G memory, better to be multiples of the number of threads.'))
-    args.add_argument('--log', default='ribodetector.log', type=str, 
+    args.add_argument('--log', default=None, type=str,
                       help='Log file name')
     args.add_argument('-v', '--version', action='version',
                       version='%(prog)s {version}'.format(version=__version__))

diff --git a/ribodetector/utils/__version__.py b/ribodetector/utils/__version__.py
@@ -1 +1 @@
-__version__ = '0.2.2'
+__version__ = '0.2.9'
diff --git a/setup.py b/setup.py
@@ -17,7 +17,7 @@
 
 setup(
     name="ribodetector",
-    version="0.2.8",
+    version="0.2.9",
     python_requires=">=3.8, <=3.10",
     author="Z-L Deng",
     author_email="[email protected]",