-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuniprot.py
35 lines (30 loc) · 1.25 KB
/
uniprot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import re
import requests
from requests.adapters import HTTPAdapter, Retry
# Retry policy: up to 5 attempts with a 0.25 backoff factor, re-trying
# responses that come back with one of the listed 5xx status codes.
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504],
)

# One shared session so every HTTPS request goes through the retrying adapter.
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

# Captures the URL between angle brackets of a `<URL>; rel="next"` Link header.
re_next_link = re.compile(r'<(.+)>; rel="next"')
# Matches a single `<URL>; rel="next"` entry anywhere in a Link header.
# `[^<>]+` keeps the capture inside one pair of angle brackets, so an entry
# for another relation (e.g. rel="prev") listed first cannot leak into it.
_NEXT_LINK_RE = re.compile(r'<([^<>]+)>\s*;\s*rel="next"')


def get_next_link(headers):
    """Return the URL of the next results page, or None if there is none.

    Args:
        headers: Mapping of response header names to values (for example
            ``response.headers``). Only the ``"Link"`` entry is read.

    Returns:
        The URL tagged ``rel="next"`` in the ``Link`` header, or ``None``
        when the header is absent or carries no "next" relation.
    """
    link_header = headers.get("Link")
    if not link_header:
        return None

    # search() rather than match(): the "next" entry is not guaranteed to be
    # the first (or only) link in the header.
    found = _NEXT_LINK_RE.search(link_header)
    if found is None:
        return None
    return found.group(1)
def get_batch(batch_url, timeout=(10, 60)):
    """Yield each page of a paginated search, following "next" links.

    Starting from ``batch_url``, fetches a page through the module-level
    ``session``, yields it, then moves on to the URL returned by
    ``get_next_link`` for that page until there is no further link.

    Args:
        batch_url: URL of the first page to request.
        timeout: Forwarded to ``session.get``: a number of seconds or a
            ``(connect, read)`` tuple. Pass ``None`` to wait indefinitely.

    Yields:
        ``(response, total)`` pairs: the response object for the page and
        the value of its ``x-total-results`` header, as sent by the server.

    Raises:
        requests.HTTPError: If a page comes back with an error status
            (raised by ``raise_for_status``).
        KeyError: If a response lacks the ``x-total-results`` header.
    """
    while batch_url:
        # Without a timeout a stalled connection would block forever and the
        # session's retry policy would never get a chance to kick in.
        response = session.get(batch_url, timeout=timeout)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        batch_url = get_next_link(response.headers)
# First page of the search. The query string asks for TSV output, 500 results
# per page, uncompressed, restricted to the listed fields.
url = "https://rest.uniprot.org/uniprotkb/search?query=organism_id:9606+AND+reviewed:true&fields=accession,reviewed,protein_name,gene_primary,cc_function,cc_tissue_specificity,cc_disease&format=tsv&compressed=false&size=500"

progress = 0  # number of data rows written so far
header_written = False  # tracked separately so it does not depend on the row count

# An explicit encoding keeps the output identical on every platform; the
# locale default can differ and may fail on non-ASCII characters.
with open('data/uniprot.tsv', 'w', encoding='utf-8') as f:
    for batch, total in get_batch(url):
        # NOTE(review): splitlines() also splits on Unicode line separators
        # (e.g. U+2028); confirm field text can never contain them.
        lines = batch.text.splitlines()

        # The first line of every page is treated as the column header and is
        # written only once; an empty body simply contributes nothing.
        rows = lines[1:]
        if lines and not header_written:
            f.write(lines[0] + '\n')
            header_written = True

        # One batched write per page instead of one print() call per row.
        f.writelines(row + '\n' for row in rows)

        progress += len(rows)
        print(f'{progress} / {total}')