
Commit

Some changes in the pipeline
ypriverol committed Jun 13, 2017
1 parent ed3456e commit 5b1b122
Showing 72 changed files with 486,354 additions and 0 deletions.
1,599 changes: 1,599 additions & 0 deletions qc-archive-import/-p/cache/CurrentRelease_ID_identify.txt

Large diffs are not rendered by default.

1,599 changes: 1,599 additions & 0 deletions qc-archive-import/-p/cache/CurrentRelease_SEQ.txt

Large diffs are not rendered by default.

461 changes: 461 additions & 0 deletions qc-archive-import/-p/cache/CurrentRelease_modifications.txt

Large diffs are not rendered by default.

1,599 changes: 1,599 additions & 0 deletions qc-archive-import/-p/cache/CurrentRelease_spectrum_identify.txt

Large diffs are not rendered by default.

1,599 changes: 1,599 additions & 0 deletions qc-archive-import/-p/cache/CurrentRelease_taxonomy_identify.txt

Large diffs are not rendered by default.

438 changes: 438 additions & 0 deletions qc-archive-import/QC_MGF_Report.Rmd

Large diffs are not rendered by default.

579 changes: 579 additions & 0 deletions qc-archive-import/QC_MGF_Report.html

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions qc-archive-import/QC_exporter_run.sh
@@ -0,0 +1,7 @@
#!/bin/sh

OUTPUT_DIRECTORY="/Users/yperez/"

# Build the cache files, comparing the current release (-i) against the previous one (-p).
python3.4 file_counter.py -i /Users/yperez/work/ms_work/cluster-work/archive_identified_2017-05 -o ${OUTPUT_DIRECTORY} -p /Users/yperez/work/ms_work/cluster-work/archive_identified_2015-05

# Render the R Markdown QC report from the cached files.
Rscript -e "rmarkdown::render('QC_MGF_Report.Rmd', params=list(output='/Users/yperez/cache'))"
236 changes: 236 additions & 0 deletions qc-archive-import/file_counter.py
@@ -0,0 +1,236 @@
"""
This program extracts information from .mgf files and generates the output
files that are later consumed by the R report script.
"""
import glob
import os
import re
import sys
import getopt
from collections import Counter
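
# Cache files written under <output>/cache/ (one set per release prefix):
#   <prefix>ID_identify.txt / <prefix>ID_unidentify.txt
#   <prefix>spectrum_identify.txt / <prefix>spectrum_unidentify.txt
#   <prefix>taxonomy_identify.txt / <prefix>taxonomy_unidentify.txt
#   <prefix>SEQ.txt and <prefix>modifications.txt (identified spectra only)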


def file_checker(pathCurrentRelease, pathPreviousRelease, pathOutput):
    """
    Collects the .mgf files of the current and previous releases.
    The paths passed in must be updated for each release.
    :param pathCurrentRelease: folder with the current release .mgf files
    :param pathPreviousRelease: folder with the previous release .mgf files
    :param pathOutput: folder where the cache files are written
    :return: the two lists of .mgf file paths
"""

    listCurrentReleaseFiles = glob.glob(pathCurrentRelease + "/" + '*.mgf')
    listPreviousReleaseFiles = glob.glob(pathPreviousRelease + "/" + '*.mgf')

    # Create the cache folder if it does not exist.
    if not os.path.exists(pathOutput + '/cache'):
        os.makedirs(pathOutput + '/cache')

    return listCurrentReleaseFiles, listPreviousReleaseFiles


def read_files(list_of_files, path, tagCache):
    """
    Parses each .mgf file, classifies every spectrum as identified (it has
    a SEQ line) or unidentified, and appends the extracted fields to the
    cache files used by the R script.
    :param list_of_files: .mgf files to parse
    :param path: cache folder where the output files are appended
    :param tagCache: prefix for the output file names (e.g. 'CurrentRelease_')
    :return:
"""

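    # A minimal sketch of the MGF record this parser expects (field values
    # are illustrative, not taken from a real file):
    #
    #   BEGIN IONS
    #   TITLE=id=PXD000001;some_file.mgf;spectrum=123
    #   PEPMASS=500.25
    #   CHARGE=2+
    #   SEQ=PEPTIDER
    #   USER03=MS:1001460,Oxidation
    #   TAXONOMY=9606
    #   END IONS
    #
    # A record with a SEQ line is classified as identified.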
    for fileName in list_of_files:
        # Records of the current file, keyed by identified (True) / unidentified (False).
        results = {True: [], False: []}
        with open(fileName) as fp:
            register = {}
            identified = False
            for line in fp:
                line = line.rstrip()
                if any(tag in line for tag in
                       ('TITLE', 'PEPMASS', 'CHARGE', 'SEQ', 'USER03', 'TAXONOMY')):
                    if 'TITLE=id' in line:
                        # TITLE lines look like TITLE=id=<id>;<source file>;spectrum=<n>
                        parts = line.split(';')
                        elements = parts[0].split('=')
                        spectrum = parts[2].split('=')
                        composition = {'id': elements[2], 'file': parts[1], 'spectrum': spectrum[1]}

                        register[elements[0]] = composition
                    else:
                        elements = line.split('=')
                        register[elements[0]] = elements[1]
                        if 'SEQ' in line:
                            identified = True
                elif 'END IONS' in line:
                    # End of a spectrum record: store it and start the next one.
                    results[identified].append(register)
                    identified = False
                    register = {}

        for result in results:
            if result:
                if results[True]:
                    with open(os.path.join(path + tagCache + 'ID_identify.txt'), 'a') as output:
                        for register in results[result]:
                            output.write(register['TITLE']['id'] + "\n")

                    with open(os.path.join(path + tagCache + 'spectrum_identify.txt'), 'a') as output:
                        for register in results[result]:
                            output.write(register['TITLE']['spectrum'] + "\n")

                    with open(os.path.join(path + tagCache + 'taxonomy_identify.txt'), 'a') as output:
                        for register in results[result]:
                            if 'TAXONOMY' in register:
                                output.write(register['TAXONOMY'] + "\n")

                    with open(os.path.join(path + tagCache + 'SEQ.txt'), 'a') as output:
                        for register in results[result]:
                            output.write(register['SEQ'] + "\n")

                    with open(os.path.join(path + tagCache + 'modifications.txt'), 'a') as output:
                        for register in results[result]:
                            # Log entries that carry a PSI-MS accession separately.
                            if register['USER03'].startswith('MS:'):
                                with open(path + 'MS_files.txt', 'a') as MS_file:
                                    MS_file.write(register['USER03'] + "\n")
                            # Strip the accession, normalize the separators so that
                            # every modification ends up on its own line, and drop
                            # numeric position prefixes.
                            line1 = re.sub(r"MS:\d+", "", register['USER03'])
                            line2 = line1.replace(";", ",").replace("\n", ",")
                            new_string = re.sub(r',+', ',', line2)
                            new_string2 = new_string.replace(",", "\n")
                            new_string3 = re.sub(r'\d+-', '', new_string2)

                            if new_string3.strip():
                                output.write(new_string3 + "\n")

            else:
                if results[False]:
                    with open(os.path.join(path + tagCache + 'ID_unidentify.txt'), 'a') as output:
                        for register in results[result]:
                            output.write(register['TITLE']['id'] + "\n")

                    with open(os.path.join(path + tagCache + 'spectrum_unidentify.txt'), 'a') as output:
                        for register in results[result]:
                            output.write(register['TITLE']['spectrum'] + "\n")

                    with open(os.path.join(path + tagCache + 'taxonomy_unidentify.txt'), 'a') as output:
                        for register in results[result]:
                            if 'TAXONOMY' in register:
                                output.write(register['TAXONOMY'] + "\n")


def column_creator(outputPath, tagCache):
    """
    Builds frequency tables (value, count) from the cache files.
    :param outputPath: cache folder holding the intermediate files
    :param tagCache: prefix of the files to process (e.g. 'CurrentRelease_')
    :return:
"""

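    # Each frequency table is tab separated, one line per distinct value,
    # e.g. (counts illustrative):
    #   PEPTIDER    12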
    # Sequences
    if os.path.exists(outputPath + tagCache + 'SEQ.txt'):
        with open(os.path.join(outputPath + tagCache + 'SEQ.txt')) as f1, \
                open(os.path.join(outputPath + tagCache + 'tables_sequences_table.txt'), 'w') as f2:
            c = Counter(x.strip() for x in f1)
            for x in c:
                f2.write("%s\t%d\n" % (x, c[x]))

    # Modifications
    if os.path.exists(outputPath + tagCache + 'modifications.txt'):
        with open(os.path.join(outputPath + tagCache + 'modifications.txt')) as f1, \
                open(os.path.join(outputPath + tagCache + 'tables_modifications_table.txt'), 'w') as f2:
            c = Counter(x.strip() for x in f1)
            for x in c:
                f2.write("%s\t%d\n" % (x, c[x]))

    # Identified spectra: a simple line count.
    if os.path.exists(outputPath + tagCache + 'spectrum_identify.txt'):
        with open(os.path.join(outputPath + tagCache + 'spectrum_identify.txt')) as f1, \
                open(outputPath + tagCache + 'tables_spectrum_ide_table.txt', 'w') as f3:
            lines1 = f1.read().count('\n')
            f3.write("%s\n%s\n" % ("Spectrum Number", lines1))

    # Unidentified spectra: a simple line count.
    if os.path.exists(outputPath + tagCache + 'spectrum_unidentify.txt'):
        with open(os.path.join(outputPath + tagCache + 'spectrum_unidentify.txt')) as f2, \
                open(outputPath + tagCache + 'tables_spectrum_unide_table.txt', 'w') as f3:
            lines2 = f2.read().count('\n')
            f3.write("%s\n%s\n" % ("Spectrum Number", lines2))

    # Taxonomy of identified spectra.
    if os.path.exists(outputPath + tagCache + 'taxonomy_identify.txt'):
        with open(os.path.join(outputPath + tagCache + 'taxonomy_identify.txt')) as f1, \
                open(os.path.join(outputPath + tagCache + 'tables_taxonomy_ide_table.txt'), 'w') as f2:
            c = Counter(x.strip() for x in f1)
            for x in c:
                f2.write("%s\t%d\n" % (x, c[x]))

    # Taxonomy of unidentified spectra.
    if os.path.exists(outputPath + tagCache + 'taxonomy_unidentify.txt'):
        with open(os.path.join(outputPath + tagCache + 'taxonomy_unidentify.txt')) as f1, \
                open(os.path.join(outputPath + tagCache + 'tables_taxonomy_unide_table.txt'), 'w') as f2:
            c = Counter(x.strip() for x in f1)
            for x in c:
                f2.write("%s\t%d\n" % (x, c[x]))


def main(argv):
    # Parse the command-line options.

    pathPreviousRelease = ''
    pathCurrentRelease = ''
    pathOutput = ''

    try:
        opts, args = getopt.getopt(argv, "hi:p:o:", ["ifile=", "ifileprev=", "ofile="])
    except getopt.GetoptError:
        print('file_counter.py -i <inputcurrent> -p <inputprevious> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('file_counter.py -i <inputcurrent> -p <inputprevious> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            pathCurrentRelease = arg
        elif opt in ("-p", "--ifileprev"):
            pathPreviousRelease = arg
        elif opt in ("-o", "--ofile"):
            pathOutput = arg

    print('Current Cluster Release Import Folder', pathCurrentRelease)
    print('Previous Cluster Release Import Folder', pathPreviousRelease)

    (listCurrentFiles, listPreviousFiles) = file_checker(pathCurrentRelease, pathPreviousRelease, pathOutput)

    # Create the main cache files for both releases.
    read_files(listCurrentFiles, pathOutput + '/cache/', "CurrentRelease_")
    read_files(listPreviousFiles, pathOutput + '/cache/', "PreviousRelease_")

    # Create the frequency tables.
    column_creator(pathOutput + '/cache/', "CurrentRelease_")
    column_creator(pathOutput + '/cache/', "PreviousRelease_")


if __name__ == "__main__":
    main(sys.argv[1:])
22 changes: 22 additions & 0 deletions qc-clustering-results/duplicate_remover.pl
@@ -0,0 +1,22 @@
#!/usr/bin/perl
use warnings;
use strict;

# This script removes duplicated spectra from the input file.
# Run it as: perl duplicate_remover.pl [input file]
# It writes an output file called total.norepeat.clustering.
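# Input sketch (illustrative; only the ==Cluster== markers and the
# spectrum=<digits> token are actually matched):
#   ==Cluster==
#   ...;spectrum=123;...
#   ...;spectrum=124;...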

my %seen;
my $count = 0;
my $output;

# Redirect STDOUT to the de-duplicated output file.
open STDOUT, '>', "total.norepeat.clustering" or die "Can't create filehandle: $!";
while ( <> ) {
    # Collapse runs of semicolons into a single one.
    s/;+/;/g;
    # Skip any line whose spectrum id has already been seen.
    next if ( m/spectrum=(\d+)/ and $seen{$1}++ );
    # A cluster boundary starts a new temp file; print then targets it.
    if ( m/==Cluster==/ ) {
        open ( $output, ">", "temp".$count++ ) or die $!;
        select $output;
    }
    print;
}
close STDOUT;
21 changes: 21 additions & 0 deletions qc-clustering-results/duplicate_remover.py
@@ -0,0 +1,21 @@
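"""
Removes runs of consecutive duplicated lines from a clustering result file;
a Python counterpart to duplicate_remover.pl. The input file name below is
hard-coded (test1.txt).
"""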
from itertools import groupby
import re

# Read the clustering file and split each line on the cluster marker.
with open("test1.txt", 'r') as f_src:
    data = [k.rstrip().split("=Cluster=") for k in f_src]

# groupby collapses runs of consecutive identical lines, which removes the duplicates.
final = [k for k, _ in groupby(data)]

with open("file_repeated_removed.txt", 'a') as f:
    for k in final:
        if k == ['', '']:
            f.write("=Cluster=\n")
        elif k == ['']:
            f.write("\n\n")
        else:
            # Join the split parts back together, one per line.
            f.write("\n".join(k) + "\n")

# Second pass: collapse runs of semicolons, as the Perl version does.
with open("file_repeated_removed.txt", 'r') as f_in, \
        open("file_repeated_removed_final.txt", 'w') as f_out:
    for line in f_in:
        f_out.write(re.sub(';+', ';', line))
@@ -0,0 +1,6 @@
.idea
build/
dist/
docs/build
*.spec
*.pyc
