From ffbb7a1fe929c9839438ec6feeeca784587ad775 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Tue, 15 Nov 2016 14:41:44 +0000 Subject: [PATCH 01/11] Add scraped page archive gem --- Gemfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Gemfile b/Gemfile index 56c3411..09e418c 100644 --- a/Gemfile +++ b/Gemfile @@ -12,3 +12,4 @@ gem "colorize" gem "nokogiri" gem "open-uri-cached" gem 'pdf-reader' +gem 'scraped_page_archive' From e600cb20532f04b006bf4b78a137b83d21b91453 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Tue, 15 Nov 2016 14:42:03 +0000 Subject: [PATCH 02/11] Bundle install --- Gemfile.lock | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Gemfile.lock b/Gemfile.lock index 9c19fd3..6a599c6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -48,3 +48,9 @@ DEPENDENCIES pdf-reader pry scraperwiki! + +RUBY VERSION + ruby 2.0.0p648 + +BUNDLED WITH + 1.13.5 From 77b76227de5b3fb23346274599e9372882675122 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Tue, 15 Nov 2016 14:42:21 +0000 Subject: [PATCH 03/11] Add scraped page archive --- scraper.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scraper.rb b/scraper.rb index 09ffe4d..54caeac 100644 --- a/scraper.rb +++ b/scraper.rb @@ -2,8 +2,9 @@ # encoding: utf-8 require 'scraperwiki' -require 'open-uri' +# require 'open-uri' require 'pdf-reader' +require 'scraped_page_archive/open-uri' # require 'colorize' # require 'pry' From c4040eb68e7322f43078f6e90f24e4f749fa3ad7 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Fri, 18 Nov 2016 10:37:58 +0000 Subject: [PATCH 04/11] Add rubocop config --- .rubocop.yml | 12 ++++++++++++ .rubocop_todo.yml | 0 2 files changed, 12 insertions(+) create mode 100644 .rubocop.yml create mode 100644 .rubocop_todo.yml diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 0000000..6f1dcf2 --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,12 @@ +AllCops: + Exclude: + - 'Vagrantfile' + - 'vendor/**/*' + TargetRubyVersion: 2.3 + +inherit_from: + - https://raw.githubusercontent.com/everypolitician/everypolitician-data/master/.rubocop_base.yml + - .rubocop_todo.yml + +Style/AndOr: + Enabled: false diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml new file mode 100644 index 0000000..e69de29 From 1b7d41d035e543bace6c6e03fb5846c95eb11fc8 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Fri, 18 Nov 2016 10:39:11 +0000 Subject: [PATCH 05/11] Add rubocop gem --- Gemfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Gemfile b/Gemfile index 09e418c..15cc3ae 100644 --- a/Gemfile +++ b/Gemfile @@ -13,3 +13,4 @@ gem "nokogiri" gem "open-uri-cached" gem 'pdf-reader' gem 'scraped_page_archive' +gem 'rubocop' From b5a70890e8a5428b0609c1e06766729941c32052 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Fri, 18 Nov 2016 10:41:05 +0000 Subject: [PATCH 06/11] Bundle install --- Gemfile.lock | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/Gemfile.lock b/Gemfile.lock index 6a599c6..f1f8e04 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -11,9 +11,16 @@ GEM remote: https://rubygems.org/ specs: Ascii85 (1.0.2) + addressable (2.5.0) + public_suffix (~> 2.0, >= 2.0.2) afm (0.2.2) + ast (2.3.0) coderay (1.1.0) colorize (0.7.7) + crack (0.4.3) + safe_yaml (~> 1.0.0) + git (1.3.0) + hashdiff (0.3.0) hashery (2.1.1) httpclient (2.6.0.1) method_source (0.8.2) @@ -21,22 +28,47 @@ GEM nokogiri (1.6.6.2) mini_portile (~> 0.6.0) open-uri-cached (0.0.5) + parser (2.3.1.4) + ast (~> 2.2) pdf-reader (1.3.3) Ascii85 (~> 1.0.0) afm (~> 0.2.0) hashery (~> 2.0) ruby-rc4 ttfunk + powerpack (0.1.1) pry (0.10.1) coderay (~> 1.1.0) method_source (~> 0.8.1) slop (~> 3.4) + public_suffix (2.0.4) + rainbow (2.1.0) + rubocop (0.45.0) + parser (>= 2.3.1.1, < 3.0) + powerpack (~> 0.1) + rainbow (>= 1.99.1, < 3.0) + ruby-progressbar (~> 1.7) + unicode-display_width (~> 1.0, >= 1.0.1) + ruby-progressbar (1.8.1) ruby-rc4 (0.1.5) + safe_yaml (1.0.4) + scraped_page_archive (0.5.0) + git (~> 1.3.0) + vcr-archive (~> 0.3.0) slop (3.6.0) sqlite3 (1.3.10) sqlite_magic (0.0.3) sqlite3 ttfunk (1.4.0) + unicode-display_width (1.1.1) + vcr (3.0.3) + vcr-archive (0.3.0) + vcr (~> 3.0.2) + webmock (~> 2.0.3) + webmock (2.0.3) + addressable (>= 2.3.6) + crack (>= 0.3.2) + hashdiff PLATFORMS ruby @@ -47,6 +79,8 @@ DEPENDENCIES open-uri-cached pdf-reader pry + rubocop + scraped_page_archive scraperwiki! RUBY VERSION From cd62b98357e18441bcab1f18bd016599f20f6d7e Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Mon, 21 Nov 2016 10:45:23 +0000 Subject: [PATCH 07/11] Tidy Gemfile for Rubocop --- Gemfile | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Gemfile b/Gemfile index 15cc3ae..5608632 100644 --- a/Gemfile +++ b/Gemfile @@ -1,16 +1,18 @@ +# frozen_string_literal: true # It's easy to add more libraries or choose different versions. Any libraries # specified here will be installed and made available to your morph.io scraper. # Find out more: https://morph.io/documentation/ruby -source "https://rubygems.org" +source 'https://rubygems.org' -ruby "2.0.0" +ruby '2.0.0' -gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults" -gem "pry" -gem "colorize" -gem "nokogiri" -gem "open-uri-cached" +gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', + branch: 'morph_defaults' +gem 'pry' +gem 'colorize' +gem 'nokogiri' +gem 'open-uri-cached' gem 'pdf-reader' gem 'scraped_page_archive' gem 'rubocop' From a036d380a9d383c2122fac845aa7c8141c584a77 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Mon, 21 Nov 2016 11:06:54 +0000 Subject: [PATCH 08/11] Add secure git gem source --- Gemfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index 5608632..eaa36db 100644 --- a/Gemfile +++ b/Gemfile @@ -4,10 +4,11 @@ # Find out more: https://morph.io/documentation/ruby source 'https://rubygems.org' +git_source(:github) { |repo_name| "https://github.com/#{repo_name}" } ruby '2.0.0' -gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', +gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby', branch: 'morph_defaults' gem 'pry' gem 'colorize' From b3ca9324d64bf193d358c8aa81c62be2e7fdaf0d Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Mon, 21 Nov 2016 13:03:43 +0000 Subject: [PATCH 09/11] Fix Github gem source Source was pointing to :git not :github source as defined above --- Gemfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index eaa36db..b205544 100644 --- a/Gemfile +++ b/Gemfile @@ -8,7 +8,7 @@ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" } ruby '2.0.0' -gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby', +gem 'scraperwiki', github: 'openaustralia/scraperwiki-ruby', branch: 'morph_defaults' gem 'pry' gem 'colorize' From c9b6f6f3bbf533e16a3b53c800dbddd7f7225811 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Mon, 21 Nov 2016 13:04:04 +0000 Subject: [PATCH 10/11] Update Gemfile.lock with bundle install --- Gemfile.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index f1f8e04..3cfb550 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,5 +1,5 @@ GIT - remote: https://github.com/openaustralia/scraperwiki-ruby.git + remote: https://github.com/openaustralia/scraperwiki-ruby revision: fc50176812505e463077d5c673d504a6a234aa78 branch: morph_defaults specs: @@ -22,7 +22,7 @@ GEM git (1.3.0) hashdiff (0.3.0) hashery (2.1.1) - httpclient (2.6.0.1) + httpclient (2.8.2.4) method_source (0.8.2) mini_portile (0.6.2) nokogiri (1.6.6.2) @@ -56,8 +56,8 @@ GEM git (~> 1.3.0) vcr-archive (~> 0.3.0) slop (3.6.0) - sqlite3 (1.3.10) - sqlite_magic (0.0.3) + sqlite3 (1.3.12) + sqlite_magic (0.0.6) sqlite3 ttfunk (1.4.0) unicode-display_width (1.1.1) From c06ca087d374c3b6cfa6f8192400412225173efb Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Wed, 23 Nov 2016 09:44:59 +0000 Subject: [PATCH 11/11] Archive some other (unprocessed) pages The only two relevant pages I could find from the outgoing legislature were the Commissions and the list of female deputies. --- scraper.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scraper.rb b/scraper.rb index 54caeac..dc77c60 100644 --- a/scraper.rb +++ b/scraper.rb @@ -43,3 +43,7 @@ def scrape_list(url) ScraperWiki.save_sqlite([:id], term, 'terms') scrape_list('http://www.assemblee-nationale.ga/object.getObject.do?id=190') + +# archive some pages for later processing +open('http://www.assemblee-nationale.ga/34-deputes/168-bureaux-des-commissions/') +open('http://www.assemblee-nationale.ga/34-deputes/153-les-femmes-deputes/')