-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhaaretz.recipe
79 lines (69 loc) · 2.87 KB
/
haaretz.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# vim:fileencoding=utf-8
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re
"""
A Calibre recipe for http://www.haaretz.co.il
"""
__license__ = 'GPL v3'
__copyright__ = '2015, Guy Rutenberg (http://www.guyrutenberg.com/contact-me)'
from calibre.web.feeds.news import BasicNewsRecipe
import urllib
class HaaretzRecipe(BasicNewsRecipe):
    """Calibre news recipe for the Haaretz web site (www.haaretz.co.il).

    Articles are collected from the feeds listed in ``feeds``, fetched
    through the site's print-page URL, and reduced to the ``div`` whose id
    is ``content``.  When a username and password are configured,
    ``get_browser`` signs in through the SSO endpoint before downloading.
    """

    title = u'הארץ'
    __author__ = "Guy Rutenberg"
    language = 'he'
    max_articles_per_feed = 100
    auto_cleanup = False
    use_embedded_content = False
    encoding = 'utf-8'
    timefmt = '%d/%m/%Y'
    no_stylesheets = True
    needs_subscription = True
    ignore_duplicate_articles = {'url'}

    # (section title, feed URL) pairs.
    feeds = [
        (u'סוף השבוע', 'http://www.haaretz.co.il/cmlink/%D7%A1%D7%95%D7%A3-%D7%A9%D7%91%D7%95%D7%A2-1.1475630'),
        (u'מאמרים ודעות', 'http://www.haaretz.co.il/cmlink/%D7%9E%D7%90%D7%9E%D7%A8%D7%99%D7%9D-%D7%95%D7%93%D7%A2%D7%95%D7%AA-1.1472481'),
        (u'מגזין', 'http://www.haaretz.co.il/cmlink/1.1617677'),
        (u'גלריה', 'http://www.haaretz.co.il/cmlink/%D7%92%D7%9C%D7%A8%D7%99%D7%94-1.1472287'),
    ]

    # Render the (Hebrew) text right-to-left.
    extra_css = """body {direction:rtl;}"""

    # Drop the social-navigation and mobile-link containers.
    remove_tags = [
        dict(name='div', attrs={'class': re.compile(r'\bsocial-nav\b')}),
        dict(name='div', attrs={'class': re.compile(r'\bmobile-link\b')}),
    ]

    # Keep only the main content container of each page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'content'}),
    ]

    def print_version(self, url):
        """Return the print-page URL for the article at ``url``.

        The text after the last ``/`` of ``url`` is appended to the site's
        print-page prefix.
        """
        return 'http://www.haaretz.co.il/misc/article-print-page/' + url.rsplit('/', 1)[1]

    def preprocess_raw_html(self, raw_html, url):
        """Strip HTML comments from ``raw_html`` and return the result.

        The HTML DOM tree served by the site is broken, so comments are
        removed before parsing.  ``url`` is not used.
        """
        # Order matters: the first pattern removes a whole conditional block
        # (from "<!--[if" up to the nearest "if]-->"), including any markup
        # enclosed between its opening and closing markers; the second then
        # removes every remaining ordinary comment.
        comment_patterns = (
            re.compile(r'<!--\[if.*?if\]-->', re.DOTALL),
            re.compile(r'<!--.*?-->', re.DOTALL),
        )
        for pattern in comment_patterns:
            raw_html = pattern.sub("", raw_html)
        return raw_html

    def get_browser(self):
        """Return a browser, signed in when credentials are configured.

        The site's front page is always opened first.  If both
        ``self.username`` and ``self.password`` are set, the credentials are
        posted to the SSO sign-in endpoint and every ``https://`` URL found
        in the reply is visited.

        Raises:
            RuntimeError: if the sign-in reply does not start with the
                expected success prefix.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.haaretz.co.il')
        if self.username is not None and self.password is not None:
            # urlencode lives in urllib.parse on Python 3 and directly in
            # urllib on Python 2; support both.
            try:
                from urllib.parse import urlencode
            except ImportError:
                from urllib import urlencode

            data = urlencode({
                'userName': self.username,
                'password': self.password,
                'newsso': 'true',
                'fromlogin': 'true',
                'layer': 'login',
                'site': '80',
            })
            # The login page returns a list of URLs that need to be visited
            # in order to set the authentication cookies for each domain.
            res = br.open('https://sso.themarker.com/sso/sso/signIn', data)
            response = res.read()
            # On Python 3 the body is bytes; decode it so the str prefix
            # check and the str regex below work.  On Python 2 the body is
            # already str and is left untouched.
            if not isinstance(response, str):
                response = response.decode('utf-8', 'replace')
            response_prefix = "null('success',"
            if not response.startswith(response_prefix):
                raise RuntimeError("Can't sign in. Verify that the username and password are correct.")
            for cookie_url in re.findall(r'https://[^"]*', response):
                br.open(cookie_url)
        return br