diff --git a/README.md b/README.md index 7e3bd05..44c3878 100644 --- a/README.md +++ b/README.md @@ -85,12 +85,13 @@ optional arguments: -t THREADS, --threads THREADS number of threads to use. (default: 10) -m MEMORY, --memory MEMORY - amount (GB) of GPU RAM. (default: 12) + Amount (GB) of GPU RAM. (default: 12) --chunk_size CHUNK_SIZE Use this parameter when having low memory. Parsing the file in chunks. Not needed when free RAM >=5 * your_file_size (uncompressed, sum of paired ends). When chunk_size=256, memory=16 it will load 256 * 16 * 1024 reads each chunk (use ~20 GB for 100bp paired end). - -v, --version show program's version number and exit + --log LOG Log file name + -v, --version Show program's version number and exit ``` #### CPU mode @@ -104,7 +105,7 @@ ribodetector_cpu -t 20 \ --chunk_size 256 \ -o outputs/reads.nonrrna.1.fq outputs/reads.nonrrna.2.fq ``` -The above command line excutes ribodetector for paired-end reads with mean length 100 using 20 CPU cores. The input reads do not need to be same length. RiboDetector supports reads with variable length. Setting `-l` to the mean read length is recommended. +The above command line executes ribodetector for paired-end reads with mean length 100 using 20 CPU cores. The input reads do not need to be the same length. RiboDetector supports reads with variable length. Setting `-l` to the mean read length is recommended. If you need to save the log into a file, you can specify it with `--log <logfile>`. Note: when using **SLURM** job submission system, you need to specify `--cpus-per-task` to the number you CPU cores you need and set `--threads-per-core` to 1. @@ -143,7 +144,8 @@ optional arguments: --chunk_size CHUNK_SIZE chunk_size * 1024 reads to load each time. When chunk_size=1000 and threads=20, consumming ~20G memory, better to be multiples of the number of threads.. 
- -v, --version show program's version number and exit + --log LOG Log file name + -v, --version Show program's version number and exit ``` **Note**: RiboDetector uses multiprocessing with shared memory, thus the memory use of a single process indicated in `htop` or `top` is actually the total memory used by RiboDector. Some job submission system like SGE mis-calculated the total memory use by adding up the memory use of all process. If you see this do not worry it will cause out of memory issue. diff --git a/ribodetector/__init__.py b/ribodetector/__init__.py index 1fc30c1..4c45ff8 100644 --- a/ribodetector/__init__.py +++ b/ribodetector/__init__.py @@ -4,5 +4,5 @@ Accurate and rapid RiboRNA sequences Detector based on deep learning. """ -__version__ = "0.2.7" +__version__ = "0.2.9" __author__ = 'ZL Deng' diff --git a/ribodetector/detect.py b/ribodetector/detect.py index 902b484..b08a7b6 100644 --- a/ribodetector/detect.py +++ b/ribodetector/detect.py @@ -207,15 +207,21 @@ def run(self): # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels + self.logger.info('Processed {}{}{}{} sequences in total'.format( + colors.BOLD, + colors.OKCYAN, + num_seqs, + colors.ENDC)) + # Write predicted rRNA sequences if the rRNA output file is given - self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( colors.BOLD, colors.OKCYAN, num_nonrrna, colors.ENDC )) - self.logger.info('Detected {}{}{}{} rRNA sequences.'.format( + self.logger.info('Detected {}{}{}{} rRNA sequences'.format( colors.BOLD, colors.OKCYAN, num_rrna, @@ -293,7 +299,13 @@ def run(self): # del data, output, batch_labels - self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + self.logger.info('Processed {}{}{}{} sequences in total'.format( + colors.BOLD, + colors.OKCYAN, + num_seqs, + colors.ENDC)) + + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( colors.BOLD, colors.OKCYAN, 
num_nonrrna, @@ -393,14 +405,21 @@ def run_with_chunks(self): colors.ENDC)) # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels - self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + + self.logger.info('Processed {}{}{}{} sequences in total'.format( + colors.BOLD, + colors.OKCYAN, + num_read, + colors.ENDC)) + + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( colors.BOLD, colors.OKCYAN, num_nonrrna, colors.ENDC )) - self.logger.info('Detected {}{}{}{} rRNA sequences.'.format( + self.logger.info('Detected {}{}{}{} rRNA sequences'.format( colors.BOLD, colors.OKCYAN, num_rrna, @@ -413,7 +432,7 @@ def run_with_chunks(self): rrna2_fh.close() if self.args.ensure == 'both': - self.logger.info('Done! Detected {}{}{}{} rRNA sequences, discarded {}{}{}{} unclassified sequences.'.format( + self.logger.info('Discarded {}{}{}{} unclassified sequences'.format( colors.BOLD, colors.OKCYAN, num_unknown, @@ -477,6 +496,12 @@ def run_with_chunks(self): num_read, colors.ENDC)) + self.logger.info('Processed {}{}{}{} sequences in total'.format( + colors.BOLD, + colors.OKCYAN, + num_read, + colors.ENDC)) + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( colors.BOLD, colors.OKCYAN, @@ -757,15 +782,15 @@ def main(): (Only applicable for paired end reads, discard the read pair when their predicitons are discordant)''') args.add_argument('-t', '--threads', default=10, type=int, - help='number of threads to use. (default: 10)') + help='Number of threads to use. (default: 10)') args.add_argument('-m', '--memory', default=32, type=int, - help='amount (GB) of GPU RAM. (default: 12)') + help='Amount (GB) of GPU RAM. (default: 12)') args.add_argument('--chunk_size', default=None, type=int, help='Use this parameter when having low memory. 
Parsing the file in chunks.\n{}.\n{}.'.format( 'Not needed when free RAM >=5 * your_file_size (uncompressed, sum of paired ends)', 'When chunk_size=256, memory=16 it will load 256 * 16 * 1024 reads each chunk (use ~20 GB for 100bp paired end)' )) - args.add_argument('--log', default='ribodetector.log', type=str, + args.add_argument('--log', default=None, type=str, help='Log file name') args.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) diff --git a/ribodetector/detect_cpu.py b/ribodetector/detect_cpu.py index 21eddef..b7a20f7 100644 --- a/ribodetector/detect_cpu.py +++ b/ribodetector/detect_cpu.py @@ -206,15 +206,21 @@ def run(self): num_unknown += len(r1_dict[-1]) # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels + + self.logger.info('Processed {}{}{}{} sequences in total'.format( + colors.BOLD, + colors.OKCYAN, + num_seqs, + colors.ENDC)) - self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( colors.BOLD, colors.OKCYAN, num_nonrrna, colors.ENDC )) - self.logger.info('Detected {}{}{}{} rRNA sequences.'.format( + self.logger.info('Detected {}{}{}{} rRNA sequences'.format( colors.BOLD, colors.OKCYAN, num_rrna, @@ -226,7 +232,7 @@ def run(self): rrna2_fh.close() if self.args.ensure == 'both': - self.logger.info('Discarded {}{}{}{} unclassified sequences.'.format( + self.logger.info('Discarded {}{}{}{} unclassified sequences'.format( colors.BOLD, colors.OKCYAN, num_unknown, @@ -301,6 +307,12 @@ def run(self): if self.rrna is not None and r_dict[1]: rrna_fh.write('\n'.join(r_dict[1]) + '\n') + self.logger.info('Processed {}{}{}{} sequences in total'.format( + colors.BOLD, + colors.OKCYAN, + num_seqs, + colors.ENDC)) + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( colors.BOLD, colors.OKCYAN, @@ -333,6 +345,10 @@ def run_with_chunks(self): num_rrna = 0 if self.is_paired: + 
self.logger.info('Classify paired-end reads with chunk size {}{}{}'.format( + colors.BOLD, + self.chunk_size, + colors.ENDC)) if self.rrna is not None: self.logger.info('Writing output rRNA sequences into file: {}{}{}'.format( colors.OKBLUE, @@ -416,20 +432,25 @@ def run_with_chunks(self): # del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels num_read += len(chunk[0]) - self.logger.info('{}{}{}{} reads classified!'.format( - colors.BOLD, - colors.OKCYAN, + self.logger.info('{}{}{} reads finished!'.format( + colors.OKGREEN, num_read, colors.ENDC)) - self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format( + self.logger.info('Processed {}{}{}{} sequences in total'.format( + colors.BOLD, + colors.OKCYAN, + num_read, + colors.ENDC)) + + self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( colors.BOLD, colors.OKCYAN, num_nonrrna, colors.ENDC )) - self.logger.info('Detected {}{}{}{} rRNA sequences.'.format( + self.logger.info('Detected {}{}{}{} rRNA sequences'.format( colors.BOLD, colors.OKCYAN, num_rrna, @@ -441,7 +462,7 @@ def run_with_chunks(self): rrna2_fh.close() if self.args.ensure == 'both': - self.logger.info('Discarded {}{}{}{} unclassified sequences.'.format( + self.logger.info('Discarded {}{}{}{} unclassified sequences'.format( colors.BOLD, colors.OKCYAN, num_unknown, @@ -455,7 +476,7 @@ def run_with_chunks(self): else: # num_read = 0 - self.logger.info('Classify paired end reads with chunk size {}{}{}'.format( + self.logger.info('Classify reads with chunk size {}{}{}'.format( colors.BOLD, self.chunk_size, colors.ENDC)) @@ -514,12 +535,17 @@ def run_with_chunks(self): num_read += len(chunk) - self.logger.info('{}{}{}{} reads classified!'.format( - colors.BOLD, - colors.OKCYAN, + self.logger.info('{}{}{} reads finished!'.format( + colors.OKGREEN, num_read, colors.ENDC)) + self.logger.info('Processed {}{}{}{} sequences in total'.format( + colors.BOLD, + colors.OKCYAN, + num_read, + colors.ENDC)) + 
self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format( colors.BOLD, colors.OKCYAN, @@ -570,10 +596,6 @@ def detect(self): if self.chunk_size is None: self.run() else: - self.logger.info('Classify reads with chunk size {}{}{}'.format( - colors.BOLD, - self.chunk_size, - colors.ENDC)) self.run_with_chunks() @staticmethod @@ -771,12 +793,12 @@ def main(): (Only applicable for paired end reads, discard the read pair when their predicitons are discordant)''') args.add_argument('-t', '--threads', default=20, type=int, - help='number of threads to use. (default: 20)') + help='Number of threads to use. (default: 20)') args.add_argument('--chunk_size', default=None, type=int, help='chunk_size * 1024 reads to load each time. \n{}.'.format( 'When chunk_size=1000 and threads=20, consumming ~20G memory, better to be multiples of the number of threads.')) - args.add_argument('--log', default='ribodetector.log', type=str, + args.add_argument('--log', default=None, type=str, help='Log file name') args.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__)) diff --git a/ribodetector/utils/__version__.py b/ribodetector/utils/__version__.py index 020ed73..cd9b137 100644 --- a/ribodetector/utils/__version__.py +++ b/ribodetector/utils/__version__.py @@ -1 +1 @@ -__version__ = '0.2.2' +__version__ = '0.2.9' diff --git a/setup.py b/setup.py index aee93e2..512b1b4 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup( name="ribodetector", - version="0.2.8", + version="0.2.9", python_requires=">=3.8, <=3.10", author="Z-L Deng", author_email="dawnmsg@gmail.com",