Skip to content

Commit

Permalink
Change the logging behavior
Browse files Browse the repository at this point in the history
  • Loading branch information
dawnmy committed Dec 20, 2023
1 parent 496683e commit 7bf7f14
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 35 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,13 @@ optional arguments:
-t THREADS, --threads THREADS
number of threads to use. (default: 10)
-m MEMORY, --memory MEMORY
amount (GB) of GPU RAM. (default: 12)
Amount (GB) of GPU RAM. (default: 12)
--chunk_size CHUNK_SIZE
Use this parameter when having low memory. Parsing the file in chunks.
Not needed when free RAM >=5 * your_file_size (uncompressed, sum of paired ends).
When chunk_size=256, memory=16 it will load 256 * 16 * 1024 reads each chunk (use ~20 GB for 100bp paired end).
-v, --version show program's version number and exit
--log LOG Log file name
-v, --version Show program's version number and exit
```
#### CPU mode
Expand All @@ -104,7 +105,7 @@ ribodetector_cpu -t 20 \
--chunk_size 256 \
-o outputs/reads.nonrrna.1.fq outputs/reads.nonrrna.2.fq
```
The above command line excutes ribodetector for paired-end reads with mean length 100 using 20 CPU cores. The input reads do not need to be same length. RiboDetector supports reads with variable length. Setting `-l` to the mean read length is recommended.
The above command line executes ribodetector for paired-end reads with mean length 100 using 20 CPU cores. The input reads do not need to be the same length. RiboDetector supports reads with variable length. Setting `-l` to the mean read length is recommended. If you need to save the log into a file, you can specify it with `--log <logfile>`.
Note: when using the **SLURM** job submission system, you need to set `--cpus-per-task` to the number of CPU cores you need and set `--threads-per-core` to 1.
Expand Down Expand Up @@ -143,7 +144,8 @@ optional arguments:
--chunk_size CHUNK_SIZE
chunk_size * 1024 reads to load each time.
When chunk_size=1000 and threads=20, consumming ~20G memory, better to be multiples of the number of threads..
-v, --version show program's version number and exit
--log LOG Log file name
-v, --version Show program's version number and exit
```
**Note**: RiboDetector uses multiprocessing with shared memory, thus the memory use of a single process indicated in `htop` or `top` is actually the total memory used by RiboDetector. Some job submission systems like SGE miscalculate the total memory use by adding up the memory use of all processes. If you see this, do not worry: it will not cause an out-of-memory issue.
Expand Down
2 changes: 1 addition & 1 deletion ribodetector/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
Accurate and rapid RiboRNA sequences Detector based on deep learning.
"""

__version__ = "0.2.7"
__version__ = "0.2.9"
__author__ = 'ZL Deng'
43 changes: 34 additions & 9 deletions ribodetector/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,15 +207,21 @@ def run(self):

# del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels

self.logger.info('Processed {}{}{}{} sequences in total'.format(
colors.BOLD,
colors.OKCYAN,
num_seqs,
colors.ENDC))

# Write predicted rRNA sequences if the rRNA output file is given
self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_nonrrna,
colors.ENDC
))

self.logger.info('Detected {}{}{}{} rRNA sequences.'.format(
self.logger.info('Detected {}{}{}{} rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_rrna,
Expand Down Expand Up @@ -293,7 +299,13 @@ def run(self):

# del data, output, batch_labels

self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
self.logger.info('Processed {}{}{}{} sequences in total'.format(
colors.BOLD,
colors.OKCYAN,
num_seqs,
colors.ENDC))

self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_nonrrna,
Expand Down Expand Up @@ -393,14 +405,21 @@ def run_with_chunks(self):
colors.ENDC))

# del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels
self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(

self.logger.info('Processed {}{}{}{} sequences in total'.format(
colors.BOLD,
colors.OKCYAN,
num_read,
colors.ENDC))

self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_nonrrna,
colors.ENDC
))

self.logger.info('Detected {}{}{}{} rRNA sequences.'.format(
self.logger.info('Detected {}{}{}{} rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_rrna,
Expand All @@ -413,7 +432,7 @@ def run_with_chunks(self):
rrna2_fh.close()

if self.args.ensure == 'both':
self.logger.info('Done! Detected {}{}{}{} rRNA sequences, discarded {}{}{}{} unclassified sequences.'.format(
self.logger.info('Discarded {}{}{}{} unclassified sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_unknown,
Expand Down Expand Up @@ -477,6 +496,12 @@ def run_with_chunks(self):
num_read,
colors.ENDC))

self.logger.info('Processed {}{}{}{} sequences in total'.format(
colors.BOLD,
colors.OKCYAN,
num_read,
colors.ENDC))

self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
Expand Down Expand Up @@ -757,15 +782,15 @@ def main():
(Only applicable for paired end reads, discard the read pair when their predicitons are discordant)''')

args.add_argument('-t', '--threads', default=10, type=int,
help='number of threads to use. (default: 10)')
help='Number of threads to use. (default: 10)')
args.add_argument('-m', '--memory', default=32, type=int,
help='amount (GB) of GPU RAM. (default: 12)')
help='Amount (GB) of GPU RAM. (default: 12)')
args.add_argument('--chunk_size', default=None, type=int,
help='Use this parameter when having low memory. Parsing the file in chunks.\n{}.\n{}.'.format(
'Not needed when free RAM >=5 * your_file_size (uncompressed, sum of paired ends)',
'When chunk_size=256, memory=16 it will load 256 * 16 * 1024 reads each chunk (use ~20 GB for 100bp paired end)'
))
args.add_argument('--log', default='ribodetector.log', type=str,
args.add_argument('--log', default=None, type=str,
help='Log file name')
args.add_argument('-v', '--version', action='version',
version='%(prog)s {version}'.format(version=__version__))
Expand Down
60 changes: 41 additions & 19 deletions ribodetector/detect_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,15 +206,21 @@ def run(self):
num_unknown += len(r1_dict[-1])

# del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels

self.logger.info('Processed {}{}{}{} sequences in total'.format(
colors.BOLD,
colors.OKCYAN,
num_seqs,
colors.ENDC))

self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_nonrrna,
colors.ENDC
))

self.logger.info('Detected {}{}{}{} rRNA sequences.'.format(
self.logger.info('Detected {}{}{}{} rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_rrna,
Expand All @@ -226,7 +232,7 @@ def run(self):
rrna2_fh.close()

if self.args.ensure == 'both':
self.logger.info('Discarded {}{}{}{} unclassified sequences.'.format(
self.logger.info('Discarded {}{}{}{} unclassified sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_unknown,
Expand Down Expand Up @@ -301,6 +307,12 @@ def run(self):
if self.rrna is not None and r_dict[1]:
rrna_fh.write('\n'.join(r_dict[1]) + '\n')

self.logger.info('Processed {}{}{}{} sequences in total'.format(
colors.BOLD,
colors.OKCYAN,
num_seqs,
colors.ENDC))

self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
Expand Down Expand Up @@ -333,6 +345,10 @@ def run_with_chunks(self):
num_rrna = 0

if self.is_paired:
self.logger.info('Classify paired-end reads with chunk size {}{}{}'.format(
colors.BOLD,
self.chunk_size,
colors.ENDC))
if self.rrna is not None:
self.logger.info('Writing output rRNA sequences into file: {}{}{}'.format(
colors.OKBLUE,
Expand Down Expand Up @@ -416,20 +432,25 @@ def run_with_chunks(self):
# del r1_data, r2_data, r1_output, r2_output, r1_batch_labels, r2_batch_labels
num_read += len(chunk[0])

self.logger.info('{}{}{}{} reads classified!'.format(
colors.BOLD,
colors.OKCYAN,
# Progress message after each paired-end chunk. The format string needs
# exactly three placeholders (colour code, read count, colour reset) to
# match the three arguments; a fourth placeholder raises IndexError.
self.logger.info('{}{}{} reads finished!'.format(
colors.OKGREEN,
num_read,
colors.ENDC))

self.logger.info('Detected {}{}{}{} non-rRNA sequences.'.format(
self.logger.info('Processed {}{}{}{} sequences in total'.format(
colors.BOLD,
colors.OKCYAN,
num_read,
colors.ENDC))

self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_nonrrna,
colors.ENDC
))

self.logger.info('Detected {}{}{}{} rRNA sequences.'.format(
self.logger.info('Detected {}{}{}{} rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_rrna,
Expand All @@ -441,7 +462,7 @@ def run_with_chunks(self):
rrna2_fh.close()

if self.args.ensure == 'both':
self.logger.info('Discarded {}{}{}{} unclassified sequences.'.format(
self.logger.info('Discarded {}{}{}{} unclassified sequences'.format(
colors.BOLD,
colors.OKCYAN,
num_unknown,
Expand All @@ -455,7 +476,7 @@ def run_with_chunks(self):

else:
# num_read = 0
self.logger.info('Classify paired end reads with chunk size {}{}{}'.format(
self.logger.info('Classify reads with chunk size {}{}{}'.format(
colors.BOLD,
self.chunk_size,
colors.ENDC))
Expand Down Expand Up @@ -514,12 +535,17 @@ def run_with_chunks(self):

num_read += len(chunk)

self.logger.info('{}{}{}{} reads classified!'.format(
colors.BOLD,
colors.OKCYAN,
self.logger.info('{}{}{} reads finished!'.format(
colors.OKGREEN,
num_read,
colors.ENDC))

self.logger.info('Processed {}{}{}{} sequences in total'.format(
colors.BOLD,
colors.OKCYAN,
num_read,
colors.ENDC))

self.logger.info('Detected {}{}{}{} non-rRNA sequences'.format(
colors.BOLD,
colors.OKCYAN,
Expand Down Expand Up @@ -570,10 +596,6 @@ def detect(self):
if self.chunk_size is None:
self.run()
else:
self.logger.info('Classify reads with chunk size {}{}{}'.format(
colors.BOLD,
self.chunk_size,
colors.ENDC))
self.run_with_chunks()

@staticmethod
Expand Down Expand Up @@ -771,12 +793,12 @@ def main():
(Only applicable for paired end reads, discard the read pair when their predicitons are discordant)''')

args.add_argument('-t', '--threads', default=20, type=int,
help='number of threads to use. (default: 20)')
help='Number of threads to use. (default: 20)')

args.add_argument('--chunk_size', default=None, type=int,
help='chunk_size * 1024 reads to load each time. \n{}.'.format(
'When chunk_size=1000 and threads=20, consumming ~20G memory, better to be multiples of the number of threads.'))
args.add_argument('--log', default='ribodetector.log', type=str,
args.add_argument('--log', default=None, type=str,
help='Log file name')
args.add_argument('-v', '--version', action='version',
version='%(prog)s {version}'.format(version=__version__))
Expand Down
2 changes: 1 addition & 1 deletion ribodetector/utils/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.2.2'
__version__ = '0.2.9'
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

setup(
name="ribodetector",
version="0.2.8",
version="0.2.9",
python_requires=">=3.8, <=3.10",
author="Z-L Deng",
author_email="[email protected]",
Expand Down

0 comments on commit 7bf7f14

Please sign in to comment.