-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgene-product.py
46 lines (38 loc) · 1.45 KB
/
gene-product.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python3
import pandas as pd
from Bio import Entrez
# NOTE(review): Entrez.email is never set in this script. NCBI asks E-utilities
# clients to identify themselves -- confirm and set it before running large jobs.


def first_qualifier(feature_table, key):
    """Return the value of the first ``key`` qualifier in a feature table.

    The text is scanned row by row, so the returned value can never run past
    the end of its own line into the next feature's coordinates (which
    happened when the whole table was split on tabs at once).

    Args:
        feature_table: Feature table text as returned by efetch with
            ``rettype='ft'`` (tab-separated rows).
        key: Qualifier name to look for, e.g. ``'product'``.

    Returns:
        The field that follows ``key`` on the same row, with surrounding
        whitespace removed.

    Raises:
        ValueError: If no row contains ``key`` followed by a value.
    """
    for row in feature_table.splitlines():
        fields = row.split('\t')
        # A value has to follow the key, so the last field cannot be the key.
        if key in fields[:-1]:
            return fields[fields.index(key) + 1].strip()
    # Same wording list.index() used to produce, so the 'Error: ...' cells
    # written to the output stay comparable with earlier runs.
    raise ValueError(repr(key) + ' is not in list')


def fetch_feature_table(refseq_id, start, end):
    """Download the feature table covering ``start``..``end`` of a nuccore record.

    The handle is closed even when reading fails.
    """
    handle = Entrez.efetch(db='nuccore', id=refseq_id, seq_start=start,
                           seq_stop=end, rettype='ft')
    try:
        return handle.read()
    finally:
        handle.close()


def annotate_hit(refseq_id, start, end, fetch=fetch_feature_table):
    """Look up product name and protein ID for one proto-spacer region.

    Args:
        refseq_id: Nucleotide accession passed to efetch.
        start: First coordinate of the proto-spacer.
        end: Last coordinate of the proto-spacer.
        fetch: Callable ``(refseq_id, start, end) -> str`` that returns the
            feature table text; defaults to the NCBI download.

    Returns:
        A ``(product, protein_id)`` tuple. Any failure is reported on stdout
        and stored in place of the value as ``'Error: <message>'`` so that
        every input row still yields exactly one entry per column.
    """
    try:
        feature_table = fetch(refseq_id, start, end)
    except Exception as e:
        # Network boundary: one bad record must not abort the whole run.
        print(e, 'Refseq ID:', refseq_id, 'Start:', start, 'End:', end)
        failure = 'Error: ' + str(e)
        return failure, failure

    # Get product name
    try:
        product = first_qualifier(feature_table, 'product')
    except ValueError as e:
        print(e, 'Refseq ID:', refseq_id, 'Start:', start, 'End:', end)
        product = 'Error: ' + str(e)

    # Get protein ID (stored as e.g. "ref|<accession>|"; keep the accession)
    try:
        raw_id = first_qualifier(feature_table, 'protein_id')
        protein_id = raw_id.replace('ref|', '').strip('|')
    except ValueError as e:
        print(e, 'Refseq ID:', refseq_id, 'Start:', start, 'End:', end)
        protein_id = 'Error: ' + str(e)

    return product, protein_id


def main():
    """Annotate every self-targeting hit and write the result as a TSV file."""
    # Read CRISPRminer Self-Targeting database
    df = pd.read_csv('self-target.csv')

    # Get feature table of protospacer region, one lookup per row
    annotations = [
        annotate_hit(refseq_id, start, end)
        for refseq_id, start, end in zip(df['Refseq ID'],
                                         df['Proto-spacer Start'],
                                         df['Proto-spacer End'])
    ]

    # Modify original dataframe
    df['Product'] = [product for product, _ in annotations]
    df['Protein id'] = [protein_id for _, protein_id in annotations]

    # Save table to file
    df.to_csv('self-target-proteins.tsv', index=False, sep='\t')


if __name__ == '__main__':
    main()