# proteomeXchange_scraper.py
import argparse
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

def web_table_into_dataframe(soup):
    """Parse the ProteomeXchange results table into a pandas DataFrame."""
    cur_table = soup.find('table', attrs={'class': 'tablesorter'})
    cur_table_rows = cur_table.find_all('tr')
    # the first row holds the column names; the remaining rows are datasets
    table_header = cur_table_rows[0]
    cur_table_rows = cur_table_rows[1:]
    header_names = [i.text for i in table_header.find_all('th')]
    table_as_dict = {}
    cur_id = 0
    for tr in cur_table_rows:
        td = tr.find_all('td')
        cur_td_elements = []
        for i in td:
            link_element = i.find('a')
            if link_element is not None:
                # dataset links look like '.../GetDataset?ID=PXD012345';
                # keep only the accession after 'ID='
                value = link_element.attrs['href'].split('ID=')[-1]
            else:
                value = i.text
            cur_td_elements.append(value)
        table_as_dict[cur_id] = dict(zip(header_names, cur_td_elements))
        cur_id += 1
    return pd.DataFrame(table_as_dict).T
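
# A minimal sketch of how web_table_into_dataframe behaves, assuming the live
# results page keeps its <table class="tablesorter"> layout; the HTML below is
# illustrative, not copied from ProteomeXchange:
#
#     sample_html = '''<table class="tablesorter">
#         <tr><th>Dataset Identifier</th><th>Title</th></tr>
#         <tr><td><a href="GetDataset?ID=PXD000001">PXD000001</a></td>
#             <td>Example dataset</td></tr>
#     </table>'''
#     df = web_table_into_dataframe(BeautifulSoup(sample_html, features='lxml'))
#     # -> a one-row DataFrame with columns 'Dataset Identifier' and 'Title',
#     #    where 'PXD000001' was pulled from the link's href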

def get_page_sources(link, waitTime=5):
    """Scrape every results page and write the metadata to full_metadata_of_PX.tsv."""
    # ensure chromedriver is on PATH, or pass its location to webdriver.Chrome()
    d = webdriver.Chrome()
    d.get(link)
    time.sleep(waitTime)
    # Selenium 4+ locator syntax
    pager_element = d.find_element(By.CLASS_NAME, 'pager')
    page_size_element = pager_element.find_element(By.CLASS_NAME, 'pagesize')
    select_object = Select(page_size_element)
    time.sleep(waitTime)
    # show 1000 datasets per page to minimize the number of page turns
    select_object.select_by_value('1000')
    page_display = pager_element.find_element(By.CLASS_NAME, 'pagedisplay')
    num_pages = int(page_display.get_attribute('value').split('/')[-1])
    try:
        full_dataframe = pd.read_csv('full_metadata_of_PX.tsv', sep='\t')
    except FileNotFoundError:
        # first run: start from an empty table so every page gets scraped
        full_dataframe = pd.DataFrame(columns=['Dataset Identifier'])
    existing_IDs = full_dataframe['Dataset Identifier'].values
    for _ in range(num_pages):
        soup = BeautifulSoup(d.page_source, features='lxml')
        df = web_table_into_dataframe(soup)
        df_IDs = df['Dataset Identifier'].values
        full_dataframe = pd.concat([df, full_dataframe])
        time.sleep(1.5)
        # stop once this page overlaps with datasets scraped on a previous run
        if any(dataset_ID in existing_IDs for dataset_ID in df_IDs):
            break
        else:
            pager_element.find_element(By.CLASS_NAME, 'next').click()
    d.quit()
    full_dataframe = full_dataframe.drop_duplicates()
    full_dataframe.to_csv('full_metadata_of_PX.tsv', index=False, sep='\t')
    print("Web scraping complete.")

def keyword_matching(data, keyword_list, column2look=('Title', 'Keywords')):
    # keywords match as word prefixes (leading \b only) so stem entries such as
    # 'tumo' and 'wilm' also catch 'tumor'/'tumour' and 'wilms'; re.escape
    # guards against regex metacharacters in user-supplied keywords
    pattern = r'\b(?:{})'.format('|'.join(map(re.escape, keyword_list)))
    binary_matches = pd.DataFrame()
    for col in column2look:
        binary_matches[col] = data[col].str.contains(pattern, na=False)  # na=False: missing text = no match
    return binary_matches
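
# For example, keyword_matching(df, ['glioma', 'brain tumo']) tests the Title
# and Keywords columns against the prefix pattern r'\b(?:glioma|brain\ tumo)',
# so 'glioma', 'brain tumor', and 'brain tumour' all match, while a
# word-internal hit such as 'oligodendroglioma' does not. Broad entries like
# 'all' in the default list also match the ordinary word 'all', so expect some
# false positives among the candidates.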

# get a user-defined keyword list from the command line via -i/--inputFile
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-i", "--inputFile", type=str,
                    help="name of a text file with one keyword per line")
args = vars(parser.parse_args())

# get all datasets from ProteomeXchange
get_page_sources('http://proteomecentral.proteomexchange.org/cgi/GetDataset')
full_dataframe = pd.read_csv('full_metadata_of_PX.tsv', sep='\t')

# filter the full dataframe for human, pediatric cancer datasets
full_dataframe['Species'] = full_dataframe['Species'].str.lower()
full_dataframe['Title'] = full_dataframe['Title'].str.lower()
full_dataframe['Keywords'] = full_dataframe['Keywords'].str.lower()
# na=False keeps rows with a missing Species field out of the boolean mask
filtered_df = full_dataframe[full_dataframe['Species'].str.contains('homo sapiens', na=False)
                             | full_dataframe['Species'].str.contains('human', na=False)]

# default keyword list (lowercase, to match the lowercased columns)
pediatric_cancer_k_list = [
    'child', 'pediatric', 'paediatric', 'leukemia', 'medulloblastoma', 'astrocytoma', 'glioma',
    'retinoblastoma', 'ewing', 'osteosarcoma', 'lymphoma', 'all', 'aml', 'rhabdomyosarcoma',
    'wilm', 'nephroblastoma', 'glioblastoma', 'ependymoma', 'atrt', 'oligodendroglioma', 'pnet',
    'pineoblastoma', 'rhabdoid', 'craniopharyngioma', 'dnet', 'schwannoma', 'meningioma',
    'choroid plexus', 'chordoma', 'neuroma', 'teratoma', 'youth', 'teen', 'adolescent', 'infant',
    'babies', 'choriocarcinoma', 'bone cancer', 'neuroblastoma', 'embryonal', 'neurilemmoma',
    'brain cancer', 'brain tumo', 'spinal cord tumo', 'sarcoma', 'thyroid cancer',
    'thyroid carcinoma', 'kidney cancer', 'kidney tumo', 'central nervous system tumo',
    'cns tumo', 'dysembryoplastic neuroepithelial', 'primitive neuroectodermal', 'teratoid']

# use the user-defined keyword list if provided
if args["inputFile"] is not None:
    with open(str(args['inputFile'])) as file:
        lines = file.readlines()
    # strip newlines/whitespace and lowercase to match the lowercased columns
    keywords = [line.strip().lower() for line in lines]
else:
    keywords = pediatric_cancer_k_list

# keep rows that match at least one keyword in either Title or Keywords
filtered_df = filtered_df[keyword_matching(filtered_df, keywords).sum(axis=1) > 0]
filtered_df.to_csv('filtered_metadata_of_PX.tsv', index=False, sep='\t')
print("Dataset filtering complete.")