# analise_de_dados

'''
Created on 20/01/2016

@author: Pedro
'''

import re
from io import StringIO

from Bio import SeqIO
from Bio import Entrez

import cds


# Load the protein table, dropping the first (header) row.  The context
# manager closes the file even if reading fails.
with open("ProteinTable.txt", "r") as table_file:
    prots = table_file.readlines()[1:]

# Keep only the rows whose TP_RS locus tag number lies in 2955..3540.
prots6 = []
for prot in prots:
    chave = re.search("TP_RS[0-9]+", prot)
    if chave is None:
        # A row without a TP_RS locus tag cannot belong to the range.
        continue
    # Characters 5..9 of the match are the digits that follow "TP_RS".
    locus_number = int(chave.group()[5:10])
    if 2955 <= locus_number <= 3540:
        prots6.append(prot)

# Extract the text after the 10th tab of every selected row (the function
# description), without the trailing newline.
multispecies_prefix = "MULTISPECIES: "
funcs = []
for prot in prots6:
    columns = prot.rstrip("\n").split("\t", 10)
    if len(columns) < 11:
        # Keep funcs index-aligned with prots6: later code uses positions in
        # funcs to index prots6, so a short row gets an empty placeholder.
        funcs.append("")
        continue
    func = columns[10]
    # Strip the prefix only when it really is a prefix; slicing 14 characters
    # off a string that merely contains it would cut real text.
    if func.startswith(multispecies_prefix):
        func = func[len(multispecies_prefix):]
    funcs.append(func)


#----------------------------------------------------------------------------------------

Entrez.email = "mafaldasap@gmail.com"     # Always tell NCBI who you are

# GenBank record of the region.  The response text is read exactly once and
# kept in memory: SeqIO.read() consumes a handle, so parsing the handle first
# and calling handle.read() afterwards would return an empty string and
# leave the saved file empty.
handle4 = Entrez.efetch(db="nucleotide", id="15638995", rettype="gb", strand=1,
                        seq_start=639851, seq_stop=765100)
try:
    genbank_text = handle4.read()
finally:
    handle4.close()
record4 = SeqIO.read(StringIO(genbank_text), "genbank")

# FASTA record of the same region; the handle is closed once parsed.
handle5 = Entrez.efetch(db="nucleotide", id="15638995", rettype="fasta", strand=1,
                        seq_start=639851, seq_stop=765100)
try:
    recordF = SeqIO.read(handle5, "fasta")
finally:
    handle5.close()

#---------------------------------------------------------------------------


# Save information in variables
fastaSeq = str(recordF.seq)
filename2 = "g6.gb"
# Persist the raw GenBank text that was fetched above.
with open(filename2, "w") as save_file2:
    save_file2.write(genbank_text)

# Qualifier keys read from each CDS feature.
locus_tag, old_locus_tag, db_xref = "locus_tag", "old_locus_tag", "db_xref"
product, accession, translation = "product", "protein_id", "translation"

# Build one cds.MyCDS object per CDS feature of the GenBank record.
cdsList = []
features = record4.features
for feature in features:
    span_text = str(feature.location)
    kind = str(feature.type)
    if kind == "gene" or kind == "source":
        continue

    # The first two integers in the location text are taken as the start and
    # end inside the fetched region; the offsets shift them by the region's
    # seq_start (639851) used in the Entrez request.
    bounds = re.findall("[0-9]+", span_text)
    first_abs = int(bounds[0]) + 639851
    last_abs = int(bounds[1]) + 639850
    loc = "[{}:{}]".format(first_abs, last_abs)

    if kind != "CDS":
        continue

    quals = feature.qualifiers
    # Every qualifier is stored as str() of whatever .get() returns, so a
    # missing qualifier becomes the string "None".
    tag_text = str(quals.get(locus_tag))
    old_tag_text = str(quals.get(old_locus_tag))
    xref_text = str(quals.get(db_xref))
    product_text = str(quals.get(product))
    protein_id_text = str(quals.get(accession))
    translation_text = str(quals.get(translation))
    # Nucleotide sequence of the feature, sliced out of the FASTA record.
    nucleotides = fastaSeq[int(bounds[0]):int(bounds[1])]

    cdsList.append(
        cds.MyCDS(
            kind,
            str(feature.strand),
            loc,
            tag_text,
            old_tag_text,
            xref_text,
            product_text,
            protein_id_text,
            translation_text,
            nucleotides,
        )
    )


# Stringified translation of every CDS; a CDS without one (stored as the
# string "None") gets the placeholder "['None']" so that every entry has the
# same "['...']" shape.  The loop variable is deliberately not called `cds`,
# which would shadow the imported `cds` module.
ls = [
    entry.translation if entry.translation != "None" else "['None']"
    for entry in cdsList
]

# Drop the two leading and two trailing characters ("['" and "']").
seqs = [seq[2:-2] for seq in ls]
 
# Product description of every CDS, without the two leading and two trailing
# characters of its stringified form.
funcs2 = [entry.product[2:-2] for entry in cdsList]

# Pairwise comparison of the table functions against the record products.
# zip() stops at the shorter list, so a length mismatch no longer raises
# IndexError.
# NOTE(review): entries beyond the shorter list are not compared - confirm
# whether a length mismatch should itself count as a difference.
verif = [listed == annotated for listed, annotated in zip(funcs, funcs2)]

# Print every compared pair (no hard-coded count that can overrun the lists).
for i, (listed, annotated) in enumerate(zip(funcs, funcs2)):
    print(listed, i, "LIST")
    print(annotated, i, "NBCI")

print()

# Number of positions where the two sources disagree.
conclusao = verif.count(False)
if conclusao > 0:
    print("Nao se confirma que as funcoes sao iguais.")
else:
    print("Confirma-se que as funcoes sao iguais.")


#-----------------------------------------------------------------------

# BATCH-CD

# Positions in funcs annotated as hypothetical proteins.
hipprot = [
    position
    for position, description in enumerate(funcs)
    if description == 'hypothetical protein'
]

# The matching protein-table rows (funcs and prots6 share positions).
hipnome = [prots6[position] for position in hipprot]

# Digits that follow "TP_RS" in the locus tag of each of those rows.
locus_tag_CDD = [
    re.search("TP_RS[0-9]+", row).group()[5:10]
    for row in hipnome
]

print(locus_tag_CDD)

# Collect the translation of every CDS whose locus tag number is in
# locus_tag_CDD.  The loop variable is not called `cds`, which would shadow
# the imported `cds` module.
ls = []
for entry in cdsList:
    # locus_tag is a stringified one-element list; [7:-2] removes the
    # leading "['TP_RS" and the trailing "']".
    lt = entry.locus_tag[7:-2]
    # One output per matching list entry, exactly as a nested scan over
    # locus_tag_CDD would produce.
    for _ in range(locus_tag_CDD.count(lt)):
        transl = entry.translation
        print(transl)
        ls.append(transl[2:-2])

print(ls,"g")

# Write the selected sequences to CDD.txt, each preceded by a bare "> "
# header line.
with open("CDD.txt", 'w') as CDD:
    for sequence in ls:
        CDD.write('> \n')
        CDD.write(sequence + '\n')

#-------------------------------------------------------------------

# Read the hit table, ignoring blank lines; the context manager closes the
# file even if reading fails.
with open("hitdata.txt", "r") as hit_file:
    domin = [line for line in hit_file if line != "\n"]

# The first seven non-blank lines are skipped (presumably header material).
novo_domin = domin[7:]
print(novo_domin)

dominio = []
for hit in novo_domin:
    # Split off the text that follows the 8th tab; rows with fewer tabs
    # carry no usable entry and are ignored.
    columns = hit.rstrip("\n").split("\t", 8)
    if len(columns) < 9:
        continue
    frase = columns[8]
    print(frase)

    if "superfamily" not in frase:
        exp = " Dominio"
    else:
        exp = ""

    # Query number: all digits after "Q#", so numbers of 10 and above are
    # read in full.  Rows that do not start with "Q#<digits>" fall back to
    # the single character at index 2.
    # NOTE(review): the number is used directly as an index into
    # locus_tag_CDD; confirm whether the query numbering is 1-based, in
    # which case the index would need "- 1".
    query = re.match("Q#([0-9]+)", hit)
    query_number = int(query.group(1)) if query else int(hit[2:3])

    # Name up to the next tab; if no further tab exists the whole remainder
    # is used instead of failing.
    short_name = frase.split("\t", 1)[0]
    dominio.append("TP_RS" + str(locus_tag_CDD[query_number]) + " -" + exp + " " + short_name)

# One result per line in dom_sup.txt.
with open("dom_sup.txt", "w") as dom_sup:
    for entry in dominio:
        dom_sup.write(entry + "\n")