scrape.py
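"""Scrape organization entries (agency name, address, phone, website, plus
detail rows) from the Tulsa City-County Library online catalog at
opac.tulsalibrary.org and print them, caching fetched pages on disk."""
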
import hashlib
import os
import time

import requests
from bs4 import BeautifulSoup
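
# Search endpoint and form fields for the TCCL online catalog. The
# 'searchtype': 't' value appears to be a title search; these fields were
# presumably captured from the catalog's own search form.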
TCCL_TOS_HOST = 'http://opac.tulsalibrary.org:82'
TCCL_TOS_PATH = 'search~S0'
TCCL_TOS_PARAMS = {
    'searchtype': 't',
    'searcharg': '',
    'SORT': 'D',
    'extended': 0,
    'SUBMIT': 'Search',
    'searchlimits': '',
    'searchorigarg': 'ta',
}
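
# LocalWiki connection settings, read from the environment. Nothing in this
# script uses them yet; presumably they are reserved for a step that pushes
# the scraped entries into a LocalWiki instance.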
LOCALWIKI_HOST = os.getenv('LOCALWIKI_HOST', 'http://127.0.0.1:8000')
LOCALWIKI_USERNAME = os.getenv('LOCALWIKI_USERNAME', None)
LOCALWIKI_API_KEY = os.getenv('LOCALWIKI_API_KEY', None)


def search_letter(letter):
    """POST a catalog search for `letter` and scrape the result pages."""
    search_params = TCCL_TOS_PARAMS.copy()
    search_params['searcharg'] = letter
    print("POST")
    resp = requests.post('%s/%s' % (TCCL_TOS_HOST, TCCL_TOS_PATH),
                         data=search_params)
    soup = BeautifulSoup(resp.content, 'html.parser')
    scrape_entries(soup)


def scrape_entries(soup):
    """Copy every entry on the page, then recurse into the 'Next' page."""
    entries = soup.find_all('td', 'browseEntryData')
    for entry in entries:
        entry_link = entry.find_all('a')[1]
        copy_entry(entry_link['href'])
    next_link = soup.find('a', text='Next')
    if next_link:
        next_content = cached_get_content(TCCL_TOS_HOST + next_link['href'])
        next_soup = BeautifulSoup(next_content, 'html.parser')
        scrape_entries(next_soup)


def copy_entry(entry_href):
    """Fetch a single entry page and print its fields and details."""
    website = ''
    entry_content = cached_get_content(TCCL_TOS_HOST + entry_href)
    soup = BeautifulSoup(entry_content, 'html.parser')
    name = find_data_by_label(soup, 'Agency Name')
    address = find_data_by_label(soup, 'Address')
    phone = find_data_by_label(soup, 'Phone/Fax')
    link_table = soup.find('table', 'bibLinks')
    if link_table:
        website = link_table.find('a')['href']
    details = get_entry_details(soup)
    print(name, address, phone, website)
    print(details)


def find_data_by_label(soup, label):
    """Return the text of the bibInfoData cell paired with `label`."""
    tag = soup.find('td', text=label)
    return tag.find_next_sibling('td', 'bibInfoData').text.strip()


def get_entry_details(soup):
    """Collect detail rows into a dict; rows without a label continue the
    previous key's list of values."""
    details = {}
    key = None
    detail_table = soup.find_all('table', 'bibDetail')[1]
    for row in detail_table.find_all('tr'):
        label = row.find('td', 'bibInfoLabel')
        data = row.find('td', 'bibInfoData').text.strip()
        if label:
            key = label.text.strip()
            details[key] = [data]
        else:
            details[key].append(data)
    return details


def cached_get_content(url, cache_timeout=3600):
    """GET `url`, caching the response body on disk for `cache_timeout`
    seconds so repeated runs don't hammer the catalog."""
    path_hash = hashlib.md5(url.encode('utf-8')).hexdigest()
    cache_file = 'cache/%s' % path_hash
    content = None
    if os.path.exists(cache_file) and file_age(cache_file) < cache_timeout:
        with open(cache_file, 'rb') as f:
            content = f.read()
    if not content:
        print("GET")
        resp = requests.get(url)
        content = resp.content
        os.makedirs('cache', exist_ok=True)  # ensure the cache dir exists
        with open(cache_file, 'wb') as f:
            f.write(content)
    return content


def file_age(fn):
    """Seconds since `fn` was last modified."""
    return time.time() - os.stat(fn).st_mtime
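

# Kick off the scrape. Only entries starting with 'b' are searched here,
# presumably a narrowed run for testing; widening the string to 'a'..'z'
# would cover the whole catalog.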
for letter in 'b':
    search_letter(letter)