Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

signalP integration #61

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 171 additions & 0 deletions micronota/bfillings/signalp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2015--, micronota development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from os import makedirs
from os.path import join
import re

from burrito.parameters import FlagParameter, ValuedParameter
from burrito.util import CommandLineApplication, ResultPath


class SignalP(CommandLineApplication):
    '''SignalP (version 4.1) application controller.

    The command line options are declared in three groups (valued options
    holding a path, valued options holding a plain value, and flags) and
    are turned into burrito parameter objects in ``_parameters``, keyed by
    the option string itself (e.g. ``'-t'``).
    '''
    _command = 'signalp'
    _valued_path_options = [
        # Logfile if -v is defined. Default: 'STDERR'
        '-l',
        # Specify temporary file directory. Default: /var/tmp
        '-T',
        # Make fasta file with mature sequence. Default: None
        '-m',
        # Make gff file of processed sequences. Default: 'Off'
        '-n'
    ]
    _valued_nonpath_options = [
        # Setting the output format ('short', 'long', 'summary' or 'all').
        # Default: 'short'
        '-f',
        # Graphics 'png' or 'png+eps'. Requires GNUPLOT. Default: 'Off'
        '-g',
        # Signal peptide networks to use ('best' or 'notm'). Default: 'best'
        '-s',
        # Organism type (euk, gram+, gram-). Default: 'euk'
        '-t',
        # user defined D-cutoff for noTM networks
        '-u',
        # user defined D-cutoff for TM networks
        '-U',
        # Minimal predicted signal peptide length. Default: [10]
        '-M',
        # truncate to sequence length - 0 means no truncation. Default '70'
        '-c'
    ]
    _flag_options = [
        # Output this handy help message
        '-h',
        # Print SignalP version and exit
        '-V',
        # Verbose mode
        '-v',
        # Keep temporary directory. Default: 'Off'
        '-k',
        # web predictions. Default: 'Off'
        '-w'
    ]

    # Every option string is split into its prefix (the leading '-') and
    # its name (the remaining characters).
    _parameters = {}
    _parameters.update({
        i: ValuedParameter(
            Prefix=i[0], Name=i[1:], Delimiter=' ',
            IsPath=True)
        for i in _valued_path_options})
    _parameters.update({
        i: ValuedParameter(
            Prefix=i[0], Name=i[1:], Delimiter=' ')
        for i in _valued_nonpath_options})
    _parameters.update({
        i: FlagParameter(
            Prefix=i[0], Name=i[1:])
        for i in _flag_options})
    # NOTE(review): presumably makes burrito keep stderr instead of
    # discarding it -- confirm against burrito's CommandLineApplication.
    _suppress_stderr = False

    def _accept_exit_status(self, exit_status):
        '''Return True only when SignalP exited with status 0.'''
        return exit_status == 0

    def _get_result_paths(self, data):
        '''Collect the output files implied by the active parameters.

        Parameters
        ----------
        data : sequence of str
            Input data handed to the application; ``data[0]`` is opened
            as the input sequence file when ``-g`` is set.

        Returns
        -------
        dict
            Maps a result key to a ``ResultPath``. Possible keys are
            "tmp" (``-k``), "log" (``-l``), "fasta" (``-m``),
            "gff" (``-n``) and, when ``-g`` is set, "png<i>" (plus
            "eps<i>" for ``-g png+eps``), where ``<i>`` is the 0-based
            position of the sequence in the input file.
        '''
        result = {}

        # if `-k` flag is defined get temporary file dir from `-T`
        if self.Parameters['-k'].isOn():
            if self.Parameters['-T'].isOn():
                tmp_fp = self._absolute(self.Parameters['-T'].Value)
            else:
                # taken from default definition in `_valued_path_options`
                tmp_fp = '/var/tmp/'
            result['tmp'] = ResultPath(Path=tmp_fp, IsWritten=True)

        # get log, fasta and gff file paths
        for option, key in (('-l', 'log'), ('-m', 'fasta'), ('-n', 'gff')):
            if self.Parameters[option].isOn():
                fp = self._absolute(self.Parameters[option].Value)
                result[key] = ResultPath(Path=fp, IsWritten=True)

        # get png (and eps) files
        if self.Parameters['-g'].isOn():
            # The first whitespace-delimited token of every header line
            # is used as the base name of the graphics files.
            # NOTE(review): assumes SignalP names its plots this way --
            # confirm against the SignalP documentation.
            seq_ids = []
            with open(data[0]) as f:
                # iterate lazily instead of loading the whole file
                for line in f:
                    if not line.startswith('>'):
                        continue
                    tokens = line[1:].split()
                    # a bare '>' header provides no name to build a
                    # file path from, so it is skipped
                    if tokens:
                        seq_ids.append(tokens[0])

            g_fp = self.WorkingDir
            # Compare by value; the valid settings for `-g` are 'png'
            # and 'png+eps' (see `_valued_nonpath_options`).
            with_eps = self.Parameters['-g'].Value == 'png+eps'
            for i, seq_id in enumerate(seq_ids):
                png_fp = join(g_fp, '.'.join([seq_id, 'png']))
                result['png' + str(i)] = ResultPath(Path=png_fp,
                                                    IsWritten=True)
                if with_eps:
                    eps_fp = join(g_fp, '.'.join([seq_id, 'eps']))
                    result['eps' + str(i)] = ResultPath(Path=eps_fp,
                                                        IsWritten=True)
        return result


def predict_signal(in_fp, out_dir, prefix, params=None):
    '''Predict signal peptide cleavage sites for the input file.

    Notes
    -----
    It will create an output, depending on the selected parameter:
    A. short
    B. long
    C. summary
    D. all
    SignalP accepts any input and does not raise input file errors.
    Please check for correct input before running SignalP.

    Parameters
    ----------
    in_fp : str
        input file path
    out_dir : str
        output file directory path. It is created if it does not exist
        and is used as the working directory of SignalP.
    prefix : str
        name of the output file. It is not referenced by this function
        at present; output file names are taken from ``params``.
    params : dict
        Other command line parameters for SignalP. key is the option
        (e.g. "-t") and value is the value for the option (e.g. "euk").
        If the option is a flag, set the value to None.
        The dict passed in is not modified.

    Returns
    -------
    burrito.util.CommandLineAppResult
        It contains opened file handlers of stdout, stderr, and the
        output files, which can be accessed in a dict style with the
        keys of "StdOut", "StdErr", "tmp" (if specified), "gff" (if
        specified), "fasta" (if specified), "png[0-9]" & "eps[0-9]" (if
        specified) and "log" (if specified). Main output is passed to
        "StdOut".
        The exit status can be similarly fetched with the key "ExitStatus".
    '''
    # create dir if does not exist
    makedirs(out_dir, exist_ok=True)

    # Work on a copy so that the caller's dict is never altered by the
    # path rewriting below.
    params = {} if params is None else dict(params)

    # A bare file name given for -l, -m or -n is placed inside `out_dir`;
    # a value that already contains a directory part is left untouched.
    for option in ('-l', '-m', '-n'):
        value = params.get(option)
        if value is not None and '/' not in value:
            params[option] = join(out_dir, value)

    app = SignalP(InputHandler='_input_as_paths', WorkingDir=out_dir,
                  params=params)
    return app([in_fp])
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
16 changes: 16 additions & 0 deletions micronota/bfillings/tests/data/signalp/euk10.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
>IPI:IPI00000013.1 SWISS-PROT:O60911 TREMBL:B2R717 ENSEMBL:ENSP00000259470 REFSEQ:NP_001324 H-INV:HIT000252685 VEGA:OTTHUMP00000021738 Tax_Id=9606 Gene_Symbol=CTSL2 Cathepsin L2; MatureChain: 18-334
VPKFDQNLDTKWYQWKATHRRLYGANEEGWRRAVWEKNMKMIELHNGEYSQGKHGFTMAM
NAFGDMTNEEFRQMMGCFRNQKFRKGKVFREPLFLDLPKSVDWRKKGYVTPVKNQKQCGS
CWAFSATGALEGQMFRKTGKLVSLSEQNLVDCSRPQGNQGCNGGFMARAFQYVKENGGLD
SEESYPYVAVDEICKYRPENSVANDTGFTVVAPGKEKALMKAVATVGPISVAMDAGHSSF
QFYKSGIYFEPDCSSKNLDHGVLVVGYGFEGANSNNSKYWLVKNSWGPEWGSNGYVKIAK
DKNNHCGIATAASYPNV
>IPI:IPI00000023.4 SWISS-PROT:P18507 TREMBL:B4DSA1 ENSEMBL:ENSP00000354651 REFSEQ:NP_000807 H-INV:HIT000263500 VEGA:OTTHUMP00000160874 Tax_Id=9606 Gene_Symbol=GABRG2 Gamma-aminobutyric acid receptor subunit gamma-2; MatureChain: 41-467
KSDDDYEDYASNKTWVLTPKVPEGDVTVILNNLLEGYDNKLRPDIGVKPTLIHTDMYVNS
IGPVNAINMEYTIDIFFAQTWYDRRLKFNSTIKVLRLNSNMVGKIWIPDTFFRNSKKADA
HWITTPNRMLRIWNDGRVLYTLRLTIDAECQLQLHNFPMDEHSCPLEFSSYGYPREEIVY
QWKRSSVEVGDTRSWRLYQFSFVGLRNTTEVVKTTSGDYVVMSVYFDLSRRMGYFTIQTY
IPCTLIVVLSWVSFWINKDAVPARTSLGITTVLTMTTLSTIARKSLPKVSYVTAMDLFVS
VCFIFVFSALVEYGTLHYFVSNRKPSKDKDKKKKNPAPTIDIRPRSATIQMNNATHLQER
DEEYGYECLDGKDCASFFCCFEDCRTGAWRHGRIHIRIAKMDSYARIFFPTAFCLFNLVY
WVSYLYL
69 changes: 69 additions & 0 deletions micronota/bfillings/tests/data/signalp/euk10.fsa
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
>IPI:IPI00000001.2 SWISS-PROT:O95793-1 TREMBL:A8K622;Q59F99 ENSEMBL:ENSP00000360922;ENSP00000379466 REFSEQ:NP_059347 H-INV:HIT000329496 VEGA:OTTHUMP00000031233 Tax_Id=9606 Gene_Symbol=STAU1 Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1
MSQVQVQVQNPSAALSGSQILNKNQSLLSQPLMSIPSTTSSLPSENAGRPIQNSALPSAS
ITSTSAAAESITPTVELNALCMKLGKKPMYKPVDPYSRMQSTYNYNMRGGAYPPRYFYPF
PVPPLLYQVELSVGGQQFNGKGKTRQAAKHDAAAKALRILQNEPLPERLEVNGRESEEEN
LNKSEISQVFEIALKRNLPVNFEVARESGPPHMKNFVTKVSVGEFVGEGEGKSKKISKKN
AAIAVLEELKKLPPLPAVERVKPRIKKKTKPIVKPQTSPEYGQGINPISRLAQIQQAKKE
KEPEYTLLTERGLPRRREFVMQVKVGNHTAEGTGTNKKVAKRNAAENMLEILGFKVPQAQ
PTKPALKSEEKTPIKKPGDGRKVTFFEPGSGDENGTSNKEDEFRMPYLSHQQLPAGILPM
VPEVAQAVGVSQGHHTKDFTRAAPNPAKATVTAMIARELLYGGTSPTAETILKNNISSGH
VPHGPLTRPSEQLDYLSRVQGFQVEYKDFPKNNKNEFVSLINCSSQPPLISHGIGKDVES
CHDMAALNILKLLSELDQQSTEMPRTGNGPMSVCGRC
>IPI:IPI00000005.1 SWISS-PROT:P01111 TREMBL:Q5U091 ENSEMBL:ENSP00000358548 REFSEQ:NP_002515 VEGA:OTTHUMP00000013879 Tax_Id=9606 Gene_Symbol=NRAS GTPase NRas
MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG
QEEYSAMRDQYMRTGEGFLCVFAINNSKSFADINLYREQIKRVKDSDDVPMVLVGNKCDL
PTRTVDTKQAHELAKSYGIPFIETSAKTRQGVEDAFYTLVREIRQYRMKKLNSSDDGTQG
CMGLPCVVM
>IPI:IPI00000006.1 SWISS-PROT:P01112 ENSEMBL:ENSP00000309845;ENSP00000373382;ENSP00000380723;ENSP00000407586 REFSEQ:NP_001123914;NP_005334 H-INV:HIT000080764 VEGA:OTTHUMP00000162769;OTTHUMP00000166055 Tax_Id=9606 Gene_Symbol=HRAS GTPase HRas
MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG
QEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDL
AARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPG
CMSCKCVLS
>IPI:IPI00000012.4 SWISS-PROT:Q6XR72-4 TREMBL:B3KR19;B3KUL2 ENSEMBL:ENSP00000355893 REFSEQ:NP_061183 H-INV:HIT000251072 VEGA:OTTHUMP00000035563 Tax_Id=9606 Gene_Symbol=SLC30A10 Cation efflux protein family protein
MGRYSGKTCRLLFMLVLTVAFFVAELVSGYLGNSIALLSDSFNMFSDLISLCVGLSAGYI
ARRPTRGFSATYGYARAEVVGALSNAVFLTALCFTIFVEAVLRLARPERIDDPELVLIVG
VLGLLVNVVGLLIFQDCAAWFACCLRGRSRRLQQRQQLAEGCVPGAFGGPQGAEDPRRAA
DPTAPGSDSAVTLRGTSVERKREKGATVFANVAGDSFNTQNEPEDMMKKEKKSEALNIRG
VLLHVMGDALGSVVVVITAIIFYVLPLKSEDPCNWQCYIDPSLTVLMVIIILSSAFPLIK
ETAAILLQMVPKGVNMEELMSKLSAVPGISSVHEVHIWELVSGKIIATLHIKYPKDRGYQ
DASTKIREIFHHAGIHNVTIQFENVDLKEPLEQKDLLLLCNSPCISKGCAKQLCCPPGAL
PLAHVNGCAEHFLCHVNGCAEHNGGPSLDTYGSDGLSRRDAREVAIEVSLDSCLSDHGQS
LNKTQEDQCYVNRTHF
>IPI:IPI00000013.1 SWISS-PROT:O60911 TREMBL:B2R717 ENSEMBL:ENSP00000259470 REFSEQ:NP_001324 H-INV:HIT000252685 VEGA:OTTHUMP00000021738 Tax_Id=9606 Gene_Symbol=CTSL2 Cathepsin L2
MNLSLVLAAFCLGIASAVPKFDQNLDTKWYQWKATHRRLYGANEEGWRRAVWEKNMKMIE
LHNGEYSQGKHGFTMAMNAFGDMTNEEFRQMMGCFRNQKFRKGKVFREPLFLDLPKSVDW
RKKGYVTPVKNQKQCGSCWAFSATGALEGQMFRKTGKLVSLSEQNLVDCSRPQGNQGCNG
GFMARAFQYVKENGGLDSEESYPYVAVDEICKYRPENSVANDTGFTVVAPGKEKALMKAV
ATVGPISVAMDAGHSSFQFYKSGIYFEPDCSSKNLDHGVLVVGYGFEGANSNNSKYWLVK
NSWGPEWGSNGYVKIAKDKNNHCGIATAASYPNV
>IPI:IPI00000015.2 SWISS-PROT:Q08170 TREMBL:A8K644;Q53F45;Q59EF5 ENSEMBL:ENSP00000362900 REFSEQ:NP_005617 H-INV:HIT000275139 VEGA:OTTHUMP00000003790 Tax_Id=9606 Gene_Symbol=SRSF4 Serine/arginine-rich splicing factor 4
MPRVYIGRLSYQARERDVERFFKGYGKILEVDLKNGYGFVEFDDLRDADDAVYELNGKDL
CGERVIVEHARGPRRDGSYGSGRSGYGYRRSGRDKYGPPTRTEYRLIVENLSSRCSWQDL
KDYMRQAGEVTYADAHKGRKNEGVIEFVSYSDMKRALEKLDGTEVNGRKIRLVEDKPGSR
RRRSYSRSRSHSRSRSRSRHSRKSRSRSGSSKSSHSKSRSRSRSGSRSRSKSRSRSQSRS
RSKKEKSRSPSKEKSRSRSHSAGKSRSKSKDQAEEKIQNNDNVGKPKSRSPSRHKSKSKS
RSRSQERRVEEEKRGSVSRGRSQEKSLRQSRSRSRSKGGSRSRSRSRSKSKDKRKGRKRS
REESRSRSRSRSKSERSRKRGSKRDSKAGSSKKKKKEDTDRSQSRSPSRSVSKEREHAKS
ESSQREGRGESENAGTNQETRSRSRSNSKSKPNLPSESRSRSKSASKTRSRSKSRSRSAS
RSPSRSRSRSHSRS
>IPI:IPI00000017.1 TREMBL:Q16191 Tax_Id=9606 Gene_Symbol=- Orf1 5' to PD-ECGF/TP protein
MGLGAGRPDANSDAPRLRLGHDPCGRAPPPSPSARASPRSRRRAAPGQATWCPLA
>IPI:IPI00000020.1 TREMBL:Q16192 Tax_Id=9606 Gene_Symbol=ODF3B Orf3 5' of PD-ECGF/TP protein
MLPGYALAMTRAAARPRLHLRRALPHAADDVRPRARPPGARSHDRARHRRRPRLLHLRPP
TPLSALPHSGTWSGPPGPWPPQRRTASREAHLGTPDLNPESPSDTLTRYSVPPYPDLKSQ
TPNPRGFDKSWLRLPTSPRTPSRVPTRLPRSSSPPHP
>IPI:IPI00000021.5 TREMBL:A1L1A8 Tax_Id=9606 Gene_Symbol=- ODF3B protein (Fragment)
GAASRKRPPPALWARTPGWAFGGHTGPAAPSRRTTEAPGPNTSCRPTPQTTCGPGPGHLV
PARMTVRGTDGAPAYSIYGRPRRSAPFLTPGPGRYFPERAGNATYPSAPRHTIAPRNWGV
QAEQQSPGPAAYTVPSLLGPRVIGKVSAPTCSIYGRRAAGSFFEDLSKTPGPCAYQVVSP
GVYKSRAPQFTILARTSLPQDNTRKPGPAAYNVDQHRKPRGWSFGIRHSDYLAPLVTDAD
N
>IPI:IPI00000023.4 SWISS-PROT:P18507 TREMBL:B4DSA1 ENSEMBL:ENSP00000354651 REFSEQ:NP_000807 H-INV:HIT000263500 VEGA:OTTHUMP00000160874 Tax_Id=9606 Gene_Symbol=GABRG2 Gamma-aminobutyric acid receptor subunit gamma-2
MSSPNIWSTGSSVYSTPVFSQKMTVWILLLLSLYPGFTSQKSDDDYEDYASNKTWVLTPK
VPEGDVTVILNNLLEGYDNKLRPDIGVKPTLIHTDMYVNSIGPVNAINMEYTIDIFFAQT
WYDRRLKFNSTIKVLRLNSNMVGKIWIPDTFFRNSKKADAHWITTPNRMLRIWNDGRVLY
TLRLTIDAECQLQLHNFPMDEHSCPLEFSSYGYPREEIVYQWKRSSVEVGDTRSWRLYQF
SFVGLRNTTEVVKTTSGDYVVMSVYFDLSRRMGYFTIQTYIPCTLIVVLSWVSFWINKDA
VPARTSLGITTVLTMTTLSTIARKSLPKVSYVTAMDLFVSVCFIFVFSALVEYGTLHYFV
SNRKPSKDKDKKKKNPAPTIDIRPRSATIQMNNATHLQERDEEYGYECLDGKDCASFFCC
FEDCRTGAWRHGRIHIRIAKMDSYARIFFPTAFCLFNLVYWVSYLYL
5 changes: 5 additions & 0 deletions micronota/bfillings/tests/data/signalp/euk10.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
##gff-version 2
##sequence-name source feature start end score N/A ?
## -----------------------------------------------------------
IPI:IPI00000013.1 SignalP-4.1 SIGNAL 1 17 0.834 . . YES
IPI:IPI00000023.4 SignalP-4.1 SIGNAL 1 40 0.533 . . YES
Loading