From 4578610f43b54a27e1c04080dba687cce65bb2f2 Mon Sep 17 00:00:00 2001 From: Dimitris Kontokostas Date: Thu, 18 Jun 2015 17:02:37 +0300 Subject: [PATCH] Change wikipedia API calls to https and added "&continue=" in WikiApi fixes #399 --- .../scala/org/dbpedia/extraction/util/Language.scala | 10 +++++----- .../scala/org/dbpedia/extraction/util/WikiApi.scala | 12 ++++++------ .../mappings/AbstractExtractorWikipedia.scala | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/util/Language.scala b/core/src/main/scala/org/dbpedia/extraction/util/Language.scala index 61edc3eb95..46fc69c80c 100644 --- a/core/src/main/scala/org/dbpedia/extraction/util/Language.scala +++ b/core/src/main/scala/org/dbpedia/extraction/util/Language.scala @@ -25,8 +25,8 @@ import org.dbpedia.extraction.ontology.RdfNamespace * Use propertyUri.append("xy"), not string concatenation. * @param baseUri URI prefix for this wiki, e.g. "http://be-x-old.wikipedia.org", * "http://commons.wikimedia.org", "http://mappings.dbpedia.org". - * @param apiUri API URI for this wiki, e.g. "http://be-x-old.wikipedia.org/w/api.php", - * "http://commons.wikimedia.org/w/api.php", "http://mappings.dbpedia.org/api.php". + * @param apiUri API URI for this wiki, e.g. "https://be-x-old.wikipedia.org/w/api.php", + * "https://commons.wikimedia.org/w/api.php", "https://mappings.dbpedia.org/api.php". 
*/ class Language private( val wikiCode: String, @@ -69,7 +69,7 @@ object Language extends (String => Language) new DBpediaNamespace("http://"+code+".dbpedia.org/resource/"), new DBpediaNamespace("http://"+code+".dbpedia.org/property/"), "http://"+code+".wikipedia.org", - "http://"+code+".wikipedia.org/w/api.php" + "https://"+code+".wikipedia.org/w/api.php" ) } @@ -235,7 +235,7 @@ object Language extends (String => Language) new DBpediaNamespace("http://commons.dbpedia.org/resource/"), new DBpediaNamespace("http://commons.dbpedia.org/property/"), "http://commons.wikimedia.org", - "http://commons.wikimedia.org/w/api.php" + "https://commons.wikimedia.org/w/api.php" ) languages("wikidata") = @@ -248,7 +248,7 @@ object Language extends (String => Language) new DBpediaNamespace("http://wikidata.dbpedia.org/resource/"), new DBpediaNamespace("http://wikidata.dbpedia.org/property/"), "http://www.wikidata.org", - "http://www.wikidata.org/w/api.php" + "https://www.wikidata.org/w/api.php" ) diff --git a/core/src/main/scala/org/dbpedia/extraction/util/WikiApi.scala b/core/src/main/scala/org/dbpedia/extraction/util/WikiApi.scala index 4ca9a3ab8a..e40c75cb25 100644 --- a/core/src/main/scala/org/dbpedia/extraction/util/WikiApi.scala +++ b/core/src/main/scala/org/dbpedia/extraction/util/WikiApi.scala @@ -49,11 +49,11 @@ class WikiApi(url: URL, language: Language) def retrievePagesByNamespace[U](namespace : Namespace, f : WikiPage => U, fromPage : String = "") { // TODO: instead of first getting the page ids and then the pages, use something like - // ?action=query&generator=allpages&prop=revisions&rvprop=ids|content&format=xml&gapnamespace=0 + // ?action=query&continue=&generator=allpages&prop=revisions&rvprop=ids|content&format=xml&gapnamespace=0 // -> "generator" instead of "list" and "gapnamespace" instead of "apnamespace" ("gap" is for "generator all pages") //Retrieve list of pages - val response = query("?action=query&format=xml&list=allpages&apfrom=" + 
URLEncoder.encode(fromPage, "UTF-8") + "&aplimit=" + pageListLimit + "&apnamespace=" + namespace.code) + val response = query("?action=query&continue=&format=xml&list=allpages&apfrom=" + URLEncoder.encode(fromPage, "UTF-8") + "&aplimit=" + pageListLimit + "&apnamespace=" + namespace.code) //Extract page ids val pageIds = for(p <- response \ "query" \ "allpages" \ "p") yield (p \ "@pageid").head.text.toLong @@ -102,7 +102,7 @@ class WikiApi(url: URL, language: Language) { for(group <- ids.grouped(pageDownloadLimit)) { - val response = query("?action=query&format=xml&prop=revisions&"+param+"=" + group.mkString("|") + "&rvprop=ids|content|timestamp|user|userid") + val response = query("?action=query&continue=&format=xml&prop=revisions&"+param+"=" + group.mkString("|") + "&rvprop=ids|content|timestamp|user|userid") processPages(response, proc) } } @@ -119,7 +119,7 @@ class WikiApi(url: URL, language: Language) { for(titleGroup <- titles.grouped(pageDownloadLimit)) { - val response = query("?action=query&format=xml&prop=revisions&titles=" + titleGroup.map(formatWikiTitle).mkString("|") + "&rvprop=ids|content|timestamp|user|userid") + val response = query("?action=query&continue=&format=xml&prop=revisions&titles=" + titleGroup.map(formatWikiTitle).mkString("|") + "&rvprop=ids|content|timestamp|user|userid") processPages(response, proc) } } @@ -161,7 +161,7 @@ class WikiApi(url: URL, language: Language) */ def retrieveTemplateUsages(title : WikiTitle, namespace: Namespace = Namespace.Main, maxCount : Int = 500) : Seq[WikiTitle] = { - val response = query("?action=query&format=xml&list=embeddedin&eititle=" + title.encodedWithNamespace + "&einamespace=" + namespace.code + "&eifilterredir=nonredirects&eilimit=" + maxCount) + val response = query("?action=query&continue=&format=xml&list=embeddedin&eititle=" + title.encodedWithNamespace + "&einamespace=" + namespace.code + "&eifilterredir=nonredirects&eilimit=" + maxCount) for(page <- response \ "query" \ "embeddedin" \ "ei"; 
title <- page \ "@title" ) @@ -182,7 +182,7 @@ class WikiApi(url: URL, language: Language) var appropriateQuery = ""; do{ - appropriateQuery = "?action=query&format=xml&list=embeddedin&eititle=" + title.encodedWithNamespace + + appropriateQuery = "?action=query&continue=&format=xml&list=embeddedin&eititle=" + title.encodedWithNamespace + "&einamespace=0&eifilterredir=nonredirects&eilimit=" + maxCount; //Since the call can return only 500 matches at most we must use the eicontinue parameter to //get the other matches diff --git a/server/src/main/scala/org/dbpedia/extraction/mappings/AbstractExtractorWikipedia.scala b/server/src/main/scala/org/dbpedia/extraction/mappings/AbstractExtractorWikipedia.scala index b7a87c0ab4..0340010996 100644 --- a/server/src/main/scala/org/dbpedia/extraction/mappings/AbstractExtractorWikipedia.scala +++ b/server/src/main/scala/org/dbpedia/extraction/mappings/AbstractExtractorWikipedia.scala @@ -26,5 +26,5 @@ class AbstractExtractorWikipedia( extends AbstractExtractor (context) { - override def apiUrl: String = "http://" + context.language.wikiCode + ".wikipedia.org/w/api.php" + override def apiUrl: String = "https://" + context.language.wikiCode + ".wikipedia.org/w/api.php" }