
Commit

3.1 convenience update push2
leonarDubois committed Jan 20, 2021
1 parent 94b7032 commit 9036d34
Showing 5 changed files with 140 additions and 89 deletions.
39 changes: 25 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,22 @@

## PanPhlAn 3 - strain detection and characterization
# PanPhlAn 3 - strain detection and characterization

#### Pangenome-based Phylogenomic Analysis
### Pangenome-based Phylogenomic Analysis

PanPhlAn is a strain-level metagenomic profiling tool for identifying
the gene composition of individual strains in metagenomic samples.
PanPhlAn’s ability for strain-tracking and functional analysis of unknown
pathogens makes it an efficient tool for culture-free microbial population studies.

PanPhlAn is written in Python and covers the 4 main tasks:

* `panphlan_download_pangenome.py`, to download pangenome files (fasta, BowTie2 indexes and general information) for over 3,000 species

For custom pangenome generation (advanced) see the [PanPhlAn exporter](https://github.com/SegataLab/PanPhlAn_pangenome_exporter)
PanPhlAn is written in Python and covers the 4 main tasks:

* `panphlan_download_pangenome.py`, to download pangenome files (fasta file of contigs, BowTie2 indexes and general information) for over 3,000 species
* `panphlan_map.py`, to profile each metagenomic sample by mapping it against the species of interest
* `panphlan_profile.py`, to merge and process the mapping results in order to get the final gene presence/absence matrix
* `panphlan_find_gene_grp.py`, to run OPTICS clustering to find groups of genes with similar presence/absence profiles and assess whether they could be mobile elements in the genome. Also plots the presence/absence matrix as a heatmap.

For custom pangenome generation (advanced) see the [PanPhlAn exporter](https://github.com/SegataLab/PanPhlAn_pangenome_exporter)
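The scripts above chain into a single workflow: download a pangenome, map each sample against it, then merge the per-sample profiles. A minimal sketch of the command lines, where the species name, file paths, and the `--o_matrix` flag of `panphlan_profile.py` are illustrative assumptions rather than verified options:

```python
# Hypothetical PanPhlAn workflow, built as argv lists.
# Species name, paths, and panphlan_profile.py flags are example assumptions.
species = "Escherichia_coli"

download_cmd = ["panphlan_download_pangenome.py",
                "-i", species,                    # species to download
                "-o", "pangenomes/"]              # where pangenome files land

map_cmd = ["panphlan_map.py",
           "-i", "sample1.fastq",                                  # one metagenomic sample
           "--indexes", f"pangenomes/{species}/{species}",         # Bowtie2 index prefix
           "-p", f"pangenomes/{species}/{species}_pangenome.tsv",  # pangenome tsv
           "-o", f"map_results/sample1_{species}.tsv"]

profile_cmd = ["panphlan_profile.py",
               "-i", "map_results/",                               # all mapped samples
               "-p", f"pangenomes/{species}/{species}_pangenome.tsv",
               "--o_matrix", "result_profile.tsv"]                 # assumed output flag

# Each step could then be run in order with, e.g.:
# import subprocess
# subprocess.run(download_cmd, check=True)
```

`panphlan_map.py` is run once per sample; the profiling step then reads the whole results directory at once.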

---
PanPhlAn runs under Ubuntu/Linux and requires the following software tools to be installed on your system:

* Bowtie2
Expand All @@ -29,13 +28,25 @@ And the following Python libraries:
* numpy
* pandas
* scipy
* sklearn (only if using `panphlan_find_gene_grp.py`)
If visualizations are made, the following are also needed:
* matplotlib
* seaborn

For any help see the wiki or the [bioBakery forum](https://forum.biobakery.org/) for overall discussions. Purely technical issues should better be raised on GitHub than on the forum.
---

If you use PanPhlAn, please cite:

[**Integrating taxonomic, functional, and strain-level profiling of diverse microbial communities with bioBakery 3**](https://www.biorxiv.org/content/10.1101/2020.11.19.388223v1) *Francesco Beghini, Lauren J McIver, Aitor Blanco-Miguez, Leonard Dubois, Francesco Asnicar, Sagun Maharjan, Ana Mailyan, Andrew Maltez Thomas, Paolo Manghi, Mireia Valles-Colomer, George Weingart, Yancong Zhang, Moreno Zolfo, Curtis Huttenhower, Eric A Franzosa, Nicola Segata*. bioRxiv preprint (2020)

---

### Help

For any help see :
* The [PanPhlan wiki](https://github.com/SegataLab/panphlan/wiki/Home_3_0)
* The PanPhlAn step-by-step [tutorial](https://github.com/SegataLab/panphlan/wiki/Tutorial-3_0)
* The [bioBakery help forum](https://forum.biobakery.org/) for general discussions on metagenomics software. (Purely technical issues are better raised on GitHub than on the forum.)




----

[PanPhlAn] is a project of the [Computational Metagenomics Lab at CIBIO](http://segatalab.cibio.unitn.it/), University of Trento, Italy
PanPhlAn is a project of the [Computational Metagenomics Lab at CIBIO](http://segatalab.cibio.unitn.it/), University of Trento, Italy
63 changes: 40 additions & 23 deletions panphlan_download_pangenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,9 @@
from misc import info

__author__ = 'Leonard Dubois and Nicola Segata (contact on https://forum.biobakery.org/)'
__version__ = '3.0'
__date__ = '24 April 2020'
__version__ = '3.1'
__date__ = '12 Jan 2021'

#DOWNLOAD_URL = "https://www.dropbox.com/s/1gxpwk8ba0rmopp/panphlan_pangenomes_links.tsv?dl=1"
DOWNLOAD_URL = "https://www.dropbox.com/s/c6fkhz4g42w4pf2/panphlan_pangenomes_links_md5.tsv?dl=1"


Expand All @@ -26,13 +25,18 @@
:returns: the parsed arguments
"""
def read_params():
p = ap.ArgumentParser(description="")
p.add_argument('-i','--input_name', type=str, default = None,
help='')
p.add_argument('-o', '--output', type = str, default = ".",
help='')
p = ap.ArgumentParser()
required = p.add_argument_group('required arguments')
required.add_argument('-i','--input_name', type = str, default = None,
help='Name of species to download', required=True)
required.add_argument('-o', '--output', type = str, default = ".",
help='output location', required=True)
p.add_argument('-v', '--verbose', action='store_true',
help='Show progress information')
p.add_argument('--retry', type = int, default = 5,
help='Number of download retries for the pangenome. Default is 5')
p.add_argument('--wait', type = int, default = 30,
help='Number of seconds to wait between download retries. Default is 30')
return p.parse_args()
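The update moves `-i`/`-o` into an explicit "required arguments" group, so `--help` lists them under their own heading instead of among the optional flags. A minimal sketch of that argparse pattern (argument names mirror the script; the exact help strings are illustrative):

```python
import argparse as ap

def read_params(argv=None):
    p = ap.ArgumentParser()
    # An explicit group gives required options their own "required arguments"
    # heading in --help output, instead of the default "optional arguments".
    required = p.add_argument_group('required arguments')
    required.add_argument('-i', '--input_name', type=str, required=True,
                          help='Name of the species to download')
    required.add_argument('-o', '--output', type=str, required=True,
                          help='Output location')
    # Note: with required=True a `default` value can never be used,
    # so this sketch omits defaults on required options.
    p.add_argument('--retry', type=int, default=5,
                   help='Number of download retries. Default is 5')
    return p.parse_args(argv)

args = read_params(['-i', 'Escherichia_coli', '-o', 'out_dir'])
```

Omitting either `-i` or `-o` makes `parse_args` exit with a usage error, which is the behavior the new `required=True` flags buy.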


Expand Down Expand Up @@ -87,7 +91,7 @@ def download(url, download_file, overwrite=False, verbose=False):
urlretrieve(url, download_file, reporthook=ReportHook().report)
info('\n')
except EnvironmentError:
info('unable to download "{}"'.format(url), exit=True)
info('unable to download "{}"'.format(url), exit = True, exit_value = 1)
else:
info('File "{}" already present\n'.format(download_file))

Expand All @@ -97,12 +101,12 @@ def download(url, download_file, overwrite=False, verbose=False):
# ------------------------------------------------------------------------------


def find_url(query, verbose):
def find_url(query, verbose, output_path):
if verbose:
print("Retrieving mapping file...")
mapping_file = os.path.basename(DOWNLOAD_URL).replace('?dl=1', '')
download(DOWNLOAD_URL, mapping_file, overwrite=True, verbose=False)
IN = open(mapping_file, mode='r')
download(DOWNLOAD_URL, os.path.join(output_path, mapping_file), overwrite=False, verbose=False)
IN = open(os.path.join(output_path, mapping_file), mode='r')

url = ""
for line in IN:
Expand All @@ -115,18 +119,24 @@ def find_url(query, verbose):
IN.close()
if url == "":
print("Pangenome not found.\nEither the species is not available or the name provided is incorrect")
sys.exit()
sys.exit(2)

url = url.replace('?dl=0', '?dl=1')
return(url, filename, true_md5)


def extract_pangenome(archive_name, output_path):
print("Extracting archive")
process = subprocess.Popen(['tar', 'jxvf', os.path.join(output_path, archive_name), '-C', output_path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
def extract_pangenome(archive_name, output_path, verbose):
print("Extracting archive")
try:
proc = subprocess.check_output(['tar', 'jxvf', os.path.join(output_path, archive_name), '-C', output_path],
stderr=subprocess.STDOUT)
if verbose:
sys.stdout.write('[I] Archive extracted !\n')
os.remove(os.path.join(output_path, archive_name))
except subprocess.CalledProcessError:
print("Error in extracting pangenome archive \n")
sys.exit(3)
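The script shells out to `tar jxvf` via `subprocess.check_output`. The same extraction can be done portably with Python's standard `tarfile` module, avoiding the external `tar` dependency entirely; a sketch of that alternative (not what the script itself does):

```python
import os
import sys
import tarfile

def extract_pangenome(archive_name, output_path):
    """Extract a .tar.bz2 pangenome archive into output_path, then delete it."""
    archive = os.path.join(output_path, archive_name)
    try:
        # 'r:bz2' matches the bzip2-compressed archives implied by 'tar jxvf'
        with tarfile.open(archive, mode='r:bz2') as tar:
            tar.extractall(path=output_path)
        os.remove(archive)  # same post-extraction cleanup as the script
    except (tarfile.TarError, OSError) as err:
        sys.exit('Error in extracting pangenome archive: {}'.format(err))
```

Catching `tarfile.TarError` plays the role of the script's `subprocess.CalledProcessError` handler.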



# ------------------------------------------------------------------------------
Expand All @@ -141,16 +151,23 @@ def main():
if not os.path.exists(args.output) :
os.mkdir(args.output)

url, filename, true_md5 = find_url(args.input_name, args.verbose)
url, filename, true_md5 = find_url(args.input_name, args.verbose, args.output)
download(url, os.path.join(args.output, filename) )
current_hash = hashlib.md5(open( os.path.join(args.output, filename),'rb').read()).hexdigest()
while current_hash != true_md5:
retries_done = 0
while current_hash != true_md5 and retries_done < args.retry :
sys.stdout.write('[W] Incorrect MD5 sum. PanPhlAn will try to re-download the file...\n')
retries_done += 1
time.sleep(args.wait)
download(url, os.path.join(args.output, filename), overwrite=True )
current_hash = hashlib.md5(open( os.path.join(args.output, filename),'rb').read()).hexdigest()

if current_hash != true_md5:
info("File integrity (MD5) still incorrect after all download retries", exit=True, exit_value = 3)

sys.stdout.write('[I] File downloaded ! MD5 checked\n')
extract_pangenome(filename, args.output)
sys.stdout.write('[I] Archive extracted !\n')
extract_pangenome(filename, args.output, args.verbose)
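The new retry loop above boils down to: download, hash, compare against the published MD5, wait, and retry up to `--retry` times before giving up. A sketch of that logic in isolation, with the downloader injected so the flow can be followed without network access (`fetch` is a stand-in for the script's `download()`):

```python
import hashlib
import time

def download_with_md5(url, dest, fetch, true_md5, retries=5, wait=30):
    """Fetch url into dest and verify its MD5, retrying up to `retries` times.

    `fetch(url, dest)` is any callable that writes the file; in the real
    script this would be the urlretrieve-based download() helper.
    """
    for attempt in range(retries + 1):   # first try plus `retries` retries
        fetch(url, dest)
        with open(dest, 'rb') as f:
            current = hashlib.md5(f.read()).hexdigest()
        if current == true_md5:
            return dest                  # integrity confirmed
        time.sleep(wait)                 # back off before re-downloading
    raise RuntimeError('incorrect MD5 sum after {} retries'.format(retries))
```

Injecting `fetch` also makes the corrupted-then-correct case easy to exercise in a test, which is hard to do against the real Dropbox URL.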



if __name__ == '__main__':
Expand Down
79 changes: 46 additions & 33 deletions panphlan_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
from misc import check_bowtie2

__author__ = 'Leonard Dubois, Matthias Scholz, Thomas Tolio and Nicola Segata (contact on https://forum.biobakery.org/)'
__version__ = '3.0'
__date__ = '20 April 2020'
__version__ = '3.1'
__date__ = '13 Jan 2021'


DEFAULT_MIN_READ_LENGTH = 70
Expand All @@ -26,14 +26,18 @@
"""
def read_params():
p = ap.ArgumentParser(description="")
p.add_argument('-i', '--input', type = str, default=sys.stdin,
help='Metagenomic sample to map')
p.add_argument('--indexes', type = str,
help='Bowtie2 indexes path and file prefix')
p.add_argument('-p', '--pangenome', type = str,
required = p.add_argument_group('required arguments')
required.add_argument('-i', '--input', type = str, default=sys.stdin,
help='Metagenomic sample to map', required=True)
required.add_argument('--indexes', type = str,
help='Bowtie2 indexes path and file prefix', required=True)
required.add_argument('-p', '--pangenome', type = str, required=True,
help='Path to pangenome tsv file exported from ChocoPhlAn')
p.add_argument('-o', '--output', type = str, default=None,
help='Path to output file')
required.add_argument('-o', '--output', type = str,
help='Path to output file', required=True)

p.add_argument('--tmp', type=str, default='/tmp/',
help='Directory used for temporary files')
p.add_argument('--bt2', type=str, default='--very-sensitive',
help='Additional bowtie2 mapping options, separated by slash: /-D/20/-R/3/, default: -bt2 /--very-sensitive/')
p.add_argument('-b','--out_bam', type=str, default=None,
Expand Down Expand Up @@ -140,12 +144,12 @@ def samtools_sam2bam(in_sam, args):
sort_cmd = ['samtools', 'sort', '-m', str(int(args.sam_memory * 1024*1024*1024))]

if args.out_bam == None: # .bam file is not saved, only temporary bam file
tmp_bam = tempfile.NamedTemporaryFile(delete=False, prefix='panphlan_', suffix='.bam')
is_tmp = True
tmp_bam = tempfile.NamedTemporaryFile(dir = args.tmp, delete=False, prefix='panphlan_', suffix='.bam')
is_bam_tmp = True
out_bam = tmp_bam.name
else:
out_bam = args.out_bam
is_tmp = False
is_bam_tmp = False

sort_cmd += ['-o', out_bam]
# sort_cmd += ['-', '-o', out_bam]
Expand All @@ -156,7 +160,7 @@ def samtools_sam2bam(in_sam, args):
if args.verbose:
print('[I] .bam file ' + out_bam + ' has been sorted')
print('Samtools SAM->BAM translation (view+sort) completed.')
outcome = (is_tmp, out_bam)
outcome = (is_bam_tmp, out_bam)

except (KeyboardInterrupt, SystemExit):
p3.kill()
Expand All @@ -167,6 +171,8 @@ def samtools_sam2bam(in_sam, args):
p2.kill()
sys.stderr.flush()
sys.stderr.write('\r')
if is_bam_tmp:
os.unlink(tmp_bam.name)
sys.exit('[E] Execution has been manually halted.\n')
finally:
os.unlink(in_sam.name)
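This update routes the intermediate SAM/BAM files through a user-controlled `--tmp` directory (via `NamedTemporaryFile(dir=...)`) instead of the system default, which matters when `/tmp` is small and the alignments are not. The pattern in isolation (function name and prefix mirror the script; the usage lines are illustrative):

```python
import os
import tempfile

def make_tmp(suffix, tmp_dir='/tmp/'):
    """Create a named temp file under tmp_dir, as the --tmp option allows.

    delete=False keeps the file alive across separate subprocess steps
    (bowtie2 writes it, samtools reads it); the caller must os.unlink()
    it when the pipeline is done.
    """
    tmp = tempfile.NamedTemporaryFile(dir=tmp_dir, delete=False,
                                      prefix='panphlan_', suffix=suffix)
    tmp.close()
    return tmp.name

path = make_tmp('.sam', tmp_dir=tempfile.gettempdir())
# ... hand `path` to bowtie2 / samtools here ...
os.unlink(path)  # caller-side cleanup, as in the script's finally blocks
```

With `delete=True` (the default) the file would vanish as soon as the creating handle closed, breaking the hand-off between tools.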
Expand Down Expand Up @@ -211,7 +217,7 @@ def mapping(args):
p1 = subprocess.Popen(bowtie2_cmd, stdin=p0.stdout, stdout=subprocess.PIPE)
else:
p1 = subprocess.Popen(bowtie2_cmd, stdout=subprocess.PIPE)
tmp_sam = tempfile.NamedTemporaryFile(delete=False, prefix='panphlan_', suffix='.sam')
tmp_sam = tempfile.NamedTemporaryFile(dir = args.tmp, delete=False, prefix='panphlan_', suffix='.sam')
if args.verbose:
print('[I] Created temporary file ' + tmp_sam.name)
print('[W] Please wait. The computation may take several minutes...')
Expand Down Expand Up @@ -253,7 +259,7 @@ def mapping(args):
# ------------------------------------------------------------------------------
# STEP 3
# ------------------------------------------------------------------------------
def piling_up(bam_file, is_tmp, csv_file, args):
def piling_up(bam_file, is_bam_tmp, csv_file, args):
"""Create the indexes and then call the Samtool's mpileup command
CSV file: meaning of the columns
Each line consists of 5 (or optionally 6) tab-separated columns:
Expand Down Expand Up @@ -281,36 +287,42 @@ def piling_up(bam_file, is_tmp, csv_file, args):
index_cmd = ['samtools', 'index', bam_file]
print('[I] ' + ' '.join(index_cmd))
p4 = subprocess.Popen(index_cmd)
p4.wait()
if p4.returncode != 0 :
os.unlink(csv_file)
sys.exit('[E] Samtools index encountered some error.\n')
if args.verbose: print('[I] BAM file ' + bam_file + ' has been indexed')
try:
with open(csv_file, mode='w') as ocsv:
with open(csv_file, mode='w') as OUT_csv:
# command: samtools mpileup <INPUT BAM FILE> > <OUTPUT CSV FILE>
mpileup_cmd = ['samtools', 'mpileup', bam_file]
print('[I] ' + ' '.join(mpileup_cmd) + ' > ' + csv_file)
try:
p5 = subprocess.Popen(mpileup_cmd, stdout=ocsv)
p5 = subprocess.Popen(mpileup_cmd, stdout=OUT_csv)
p5.wait()
except Exception as err:
show_error_message(err)
sys.stderr.flush()
sys.stderr.write('\r')
sys.exit('[E] Samtools encountered some error.\n')
# delete tmp file
if is_tmp: os.unlink(bam_file)
os.unlink(bam_file + '.bai')
if args.verbose: print('Samtools piling up (view+mpileup) completed.')
except (KeyboardInterrupt, SystemExit):
except KeyboardInterrupt:
p5.kill()
if isTemp: os.unlink(bam_file)
sys.stderr.flush()
sys.stderr.write('\r')
sys.exit('[E] Execution has been manually halted.\n')
except (KeyboardInterrupt, SystemExit):

except KeyboardInterrupt:
p4.kill()
if isTemp: os.unlink(bam_file)
sys.stderr.flush()
sys.stderr.write('\r')
sys.exit('[E] Execution has been manually halted.\n')
sys.exit('[E] Execution has been manually halted.\n')
finally :
# delete tmp file
if is_bam_tmp:
os.unlink(bam_file)
if os.path.isfile(bam_file + '.bai'):
os.unlink(bam_file + '.bai')
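The rewritten cleanup moves temp-file removal into a `finally` block, so the temporary BAM and its `.bai` index are deleted whether the pipeline succeeds, raises, or is interrupted. The pattern in isolation, with the command runner injected so it can be exercised without samtools installed (`run` defaults to the real `subprocess.run`):

```python
import os
import subprocess

def pile_up(bam_file, csv_file, is_bam_tmp, run=subprocess.run):
    """samtools index + mpileup into csv_file, with guaranteed cleanup."""
    try:
        run(['samtools', 'index', bam_file], check=True)
        with open(csv_file, 'w') as out:
            # equivalent of: samtools mpileup <bam> > <csv>
            run(['samtools', 'mpileup', bam_file], check=True, stdout=out)
    finally:
        # runs on success, on error, and on KeyboardInterrupt alike
        if is_bam_tmp and os.path.isfile(bam_file):
            os.unlink(bam_file)
        if os.path.isfile(bam_file + '.bai'):
            os.unlink(bam_file + '.bai')
```

Guarding each `os.unlink` with `os.path.isfile` mirrors the new code's `.bai` check: the index may never have been created if `samtools index` failed.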

# ------------------------------------------------------------------------------
# STEP 4
Expand Down Expand Up @@ -343,8 +355,8 @@ def genes_abundances(reads_file, contig2gene, args):
contig, position, abundance = words[0], int(words[1]), int(words[3])
# For each gene in the contig, if position in range of gene, increase its abundance
if contig in contig2gene.keys():
for gene, (fr,to) in contig2gene[contig].items():
if position in range(fr, to+1):
for gene, (begin, end) in contig2gene[contig].items():
if begin <= position <= end:
genes_abundances[gene] += abundance
# WRITE
if args.output == None:
Expand All @@ -353,15 +365,16 @@ def genes_abundances(reads_file, contig2gene, args):
sys.stdout.write(str(g) + '\t' + str(genes_abundances[g]) + '\n')
else:
# WRITE AND THEN COMPRESS WITH copyobj()
with bz2.open(args.output + '.bz2', 'wt', compresslevel=9) as OUT:
with open(args.output , 'wt') as OUT:
for g in genes_abundances:
if genes_abundances[g] > 0:
OUT.write(str(g) + '\t' + str(genes_abundances[g]) + '\n')
except (KeyboardInterrupt, SystemExit):
os.unlink(reads_file)
sys.stderr.flush()
sys.stderr.write('\r')
sys.exit('[E] Execution has been manually halted.\n')
finally:
os.unlink(reads_file)
if args.verbose: print('Gene abundance computation completed.')
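The abundance step above walks the mpileup output (tab-separated: contig, 1-based position, reference base, read depth, ...) and adds each position's depth to every gene whose coordinate range covers it. A self-contained sketch of that accumulation, assuming 1-based inclusive gene coordinates as in the pangenome tsv:

```python
from collections import defaultdict

def gene_abundances(mpileup_lines, contig2gene):
    """Sum per-position read depth into per-gene abundance.

    contig2gene maps contig -> {gene: (start, end)}, with start/end
    assumed 1-based and inclusive (mirroring the pangenome tsv).
    """
    abund = defaultdict(int)
    for line in mpileup_lines:
        # mpileup columns: contig, position, reference base, depth, ...
        contig, pos, _ref, depth = line.rstrip('\n').split('\t')[:4]
        pos, depth = int(pos), int(depth)
        for gene, (start, end) in contig2gene.get(contig, {}).items():
            if start <= pos <= end:   # O(1) chained comparison, no range() object
                abund[gene] += depth
    return dict(abund)
```

The chained `start <= pos <= end` test is both faster and clearer than `pos in range(start, end + 1)`, which builds a range object per gene per position.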

# ------------------------------------------------------------------------------
Expand All @@ -382,16 +395,16 @@ def main():

if args.verbose: print('\nSTEP 2. Mapping the reads...')
tmp_sam = mapping(args)
is_tmp, out_bam = samtools_sam2bam(tmp_sam, args)
is_bam_tmp, out_bam = samtools_sam2bam(tmp_sam, args)

if args.verbose: print('\nSTEP 3. Piling up...')
tmp_csv = tempfile.NamedTemporaryFile(delete=False, prefix='panphlan_', suffix='.csv')
piling_up(out_bam, is_tmp, tmp_csv.name, args)
tmp_csv = tempfile.NamedTemporaryFile(dir = args.tmp, delete=False, prefix='panphlan_', suffix='.csv')
piling_up(out_bam, is_bam_tmp, tmp_csv.name, args)

if args.verbose: print('\nSTEP 4. Exporting results...')
contig2gene = build_pangenome_dicts(args)
genes_abundances(tmp_csv.name, contig2gene, args)
os.unlink(tmp_csv.name)



if __name__ == '__main__':
Expand Down
