This repository has been archived by the owner on Feb 4, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbingtools.py
43 lines (36 loc) · 1.56 KB
/
bingtools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from corpustools import WebCorpusBuilder
import urllib, urlparse
import re
def bingurl(query, quotes=False, language=None, first=None):
"""Computes a bing search URL from a query.
:param query: (str) the query string
:param quotes: (bool) whether to quote the query string
:param language: (str) an ISO 2-letter language code
:param first: (int) the first result to search from
"""
base = 'https://www.bing.com/search?q=%s'
q = '"%s"' % query if quotes else '%s' % query
cc = '' if not language else '&cc=%s' % language
f = '' if not first else '&first=%d' % first
return base % ('%s%s%s' % (urllib.quote(q), cc, f))
def bingsearch(query, quotes=False, language=None, results=50):
"""Collects the first ``limit'' links from a Bing query.
:param query: (str) the query string to ask Bing
:param quotes: (bool) whether to quote the query string
:param language: (str) an ISO 2-letter language code
:param results: (int) the result limit (default: 50)
:return (list): a list of web pages from this query.
"""
wcb = WebCorpusBuilder()
wcb.urlfilter = lambda u:\
not re.match(r'^.*//.*bing.com', u) and\
not re.match(r'^.*//.*microsoft.com', u) and\
not re.match(r'^.*//.*microsofttranslator.com', u) and\
not re.match(r'^.*//.*dictionary.*', u)
wcb.writer = lambda t: None
i = 0
while i < results:
url = bingurl(query, quotes=quotes, language=language, first=i)
wcb.feed(url)
i = len(wcb.crawler.pages)
return list(wcb.crawler.pages)[0:results]