-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscraper.rb
113 lines (95 loc) · 3.21 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/bin/env ruby
# encoding: utf-8
# frozen_string_literal: true
require 'json'
require 'nokogiri'
require 'open-uri'
require 'pry'
require 'scraped'
require 'scraperwiki'
# Scraped::JSON wrapper around a JSON document that is a list of cantons.
class CantonList < Scraped::JSON
  # Every element of the parsed JSON, wrapped as a Canton fragment.
  field :cantons do
    json.map do |entry|
      fragment(entry => Canton)
    end
  end
end
# Scraped::JSON wrapper around a single canton entry.
class Canton < Scraped::JSON
  # The canton's abbreviation, exposed as its id.
  field(:id) { json[:abbreviation] }

  # The canton's name as given in the JSON.
  field(:name) { json[:name] }

  # The :id value from the source JSON, kept as a separate identifier.
  field(:identifier__parlamentdotch) { json[:id] }
end
# Endpoint that lists the cantons.
CANTON_URL = 'http://ws-old.parlament.ch/cantons'

# Fetch the canton list once and index each canton's fields by its id
# (Canton#id is the abbreviation), so later lookups are plain hash reads.
canton_request = Scraped::Request.new(url: CANTON_URL, headers: { 'Accept' => 'text/json' })
canton_list = CantonList.new(response: canton_request.response)
@cantons = canton_list.cantons.each_with_object({}) do |canton, by_id|
  by_id[canton.id] = canton.to_h
end
# Fetches +url+ with an "Accept: text/json" header and returns the decoded
# JSON body.
#
# @param url [String] absolute URL to fetch
# @return [Array, Hash] the parsed JSON, with object keys as Symbols
def json_from(url)
  # URI#open (provided by open-uri) is used instead of Kernel#open: the latter
  # no longer opens URLs on Ruby 3.0+ and would run a command for any string
  # starting with "|". The block form closes the stream once it has been read.
  body = URI.parse(url).open('Accept' => 'text/json', &:read)
  JSON.parse(body, symbolize_names: true)
end
# Translates the single-letter gender code into a full word.
#
# @param str [String, nil] 'm', 'f', or nothing
# @return [String, nil] 'male', 'female', or nil when +str+ is nil/false
# @raise [RuntimeError] for any other value
def gender_from(str)
  return nil unless str

  case str
  when 'm' then 'male'
  when 'f' then 'female'
  else raise "unknown gender: #{str}"
  end
end
# Scrapes every councillor listed for one legislative period, following the
# paginated results until the last entry of a page no longer reports
# :hasMorePages.
#
# @param t [Hash] legislative period; only t[:id] is read here, the whole
#   hash is handed on to scrape_person
# @return [nil]
def scrape_term(t)
  return if t[:id] > 50 # or if the term start date is in the future

  base = "http://ws-old.parlament.ch/councillors/historic?legislativePeriodFromFilter=#{t[:id]}&format=json&pageNumber=%d"
  # Count pages upwards from 1 with no upper bound; the break below ends it.
  1.step do |page|
    mems = json_from(base % page)
    mems.each { |mem| scrape_person(mem, t) }

    # An empty page has no last entry to consult: treat it as the end of the
    # results instead of failing on nil.
    last_entry = mems.last
    break unless last_entry && last_entry[:hasMorePages]
  end
end
# Builds the record for one councillor within one legislative period and
# saves it via ScraperWiki, keyed on (id, term).
#
# @param mp [Hash] councillor entry; reads :id, :firstName, :lastName,
#   :birthDate, :gender, and the nested :canton, :council, :party, :faction
#   and :membership hashes. The hash is not modified.
# @param term [Hash] legislative period with :id, :start_date and :end_date
# @raise [RuntimeError] when the councillor's canton abbreviation is not a
#   key of @cantons
def scrape_person(mp, term)
  # This is icky, but it'll do until we rewrite the whole thing using Scraped
  canton = @cantons[mp[:canton][:abbreviation]] || raise("Unknown canton: #{mp[:canton]}")
  membership = mp[:membership]

  data = {
    id: mp[:id],
    identifier__parlamentdotch: mp[:id],
    name: mp[:firstName] + ' ' + mp[:lastName],
    sort_name: mp[:lastName] + ', ' + mp[:firstName],
    given_name: mp[:firstName],
    family_name: mp[:lastName],
    # Keep the first 10 characters only. Non-destructive, so the caller's
    # string is left intact, and a missing birth date simply yields nil.
    birth_date: mp[:birthDate]&.slice(0, 10),
    gender: gender_from(mp[:gender]),
    area: canton[:name],
    area_id: canton[:id],
    council: mp[:council][:abbreviation],
    council_id: mp[:council][:id],
    party: mp[:party][:abbreviation],
    party_id: mp[:party][:id],
    faction: mp[:faction][:abbreviation],
    faction_id: mp[:faction][:id],
    term: term[:id],
    source: "https://www.parlament.ch/en/biografie?CouncillorId=#{mp[:id]}",
  }

  # Membership dates are recorded only when they fall strictly inside the
  # term; the values are compared as strings.
  if membership[:entryDate]
    start_date = membership[:entryDate].slice(0, 10)
    data[:start_date] = start_date if start_date > term[:start_date]
  end

  if membership[:leavingDate]
    end_date = membership[:leavingDate].slice(0, 10)
    data[:end_date] = end_date if end_date < term[:end_date]
  end

  ScraperWiki.save_sqlite(%i[id term], data)
end
# Fetch every legislative period, store it in the 'terms' table, then scrape
# the councillors belonging to it.
terms = json_from('http://ws-old.parlament.ch/legislativeperiods?format=json')
terms.each do |t|
  # Replace :from / :to with :start_date / :end_date, keeping only the first
  # 10 characters of each value.
  t[:start_date] = t.delete(:from).slice(0, 10)
  t[:end_date] = t.delete(:to).slice(0, 10)
  # Drop the keys that should not be stored with the term.
  %i[hasMorePages updated code].each { |key| t.delete(key) }
  puts t
  ScraperWiki.save_sqlite([:id], t, 'terms')
  scrape_term(t)
end