diff --git a/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java b/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
old mode 100644
new mode 100755
index 04b71fc08a..07f5a814de
--- a/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
+++ b/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
@@ -39,11 +39,14 @@ public LinkExtractor(NifExtractorContext context) {
public void head(Node node, int depth) {
- if(skipLevel>=0)
+ if(skipLevel>=0){
return;
+ }
+
- if(paragraph == null)
- paragraph = new Paragraph(0, "", "p");
+ if(paragraph == null) {
+ paragraph = new Paragraph(0, "", "p");
+ }
//ignore all content inside invisible tags
if(invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) {
invisible = true;
@@ -52,6 +55,7 @@ public void head(Node node, int depth) {
if(node.nodeName().equals("#text")) {
String tempText = node.toString();
+
//replace no-break spaces because unescape doesn't deal with them
tempText = StringEscapeUtils.unescapeHtml4(tempText);
tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars());
@@ -59,6 +63,7 @@ public void head(Node node, int depth) {
//this text node is the content of an element: make a new nif:Word
if(inLink) {
+
if(!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) //not!
{
tempLink.setLinkText(tempText);
@@ -70,11 +75,15 @@ public void head(Node node, int depth) {
errors.add("found Template in resource: " + this.context.resource + ": " + tempText);
return;
}
+
}
else
paragraph.addText(tempText);
- } else if(node.nodeName().equals("a")) {
+ }
+
+ else if(node.nodeName().equals("a")) {
+
String link = node.attr("href");
//TODO central string management
/**
@@ -84,41 +93,62 @@ public void head(Node node, int depth) {
* see Schopenhauer: https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400
*/
String linkPrefix = "/wiki/";
- // standard wikilinks
- if (link.contains(linkPrefix) && !link.contains(":")) {
- tempLink = new Link();
- String uri = cleanLink(node.attr("href"), false);
- setUri(uri);
-
- //simple example of Help:IPA
- // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]
- } else if (link.contains(linkPrefix) && link.contains(":")) {
- /**
- * TODO buggy
- * Cleans up child nodes: difficult example
- * /ˈʃoʊpənhaʊ.ər/
- */
- if (!node.childNodes().isEmpty()) {
- if (node.childNode(0).nodeName().equals("#text") &&
- node.childNode(0).toString().contains(":") &&
- !node.childNode(0).toString().contains("http")) {
- tempLink = new Link();
- String uri = cleanLink(node.attr("href"), false);
- setUri(uri);
- }
- } else {
- skipLevel = depth;
- }
- //TODO add example
- } else if (node.attr("class").equals("external text")) {
- //don't skip external links
- tempLink = new Link();
- String uri = cleanLink(node.attr("href"), true);
- setUri(uri);
-
- } else {
- skipLevel = depth;
- }
+ // SPECIAL CASE FOR REST API PARSING
+
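+ // Parsoid/REST HTML marks internal wiki links with rel="mw:WikiLink" and external links with rel="mw:ExtLink"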
+ if(node.hasAttr("rel")){
+
+ String relType = node.attr("rel");
+ if(relType.equals("mw:WikiLink")){
+
+ tempLink = new Link();
+ String uri = cleanLink(node.attr("href"), false);
+ setUri(uri);
+
+
+ } else if (relType.equals("mw:ExtLink")) {
+ tempLink = new Link();
+ String uri = cleanLink(node.attr("href"), true);
+ setUri(uri);
+ }
+ }else{
+ // standard wikilinks
+ if (link.contains(linkPrefix) && !link.contains(":")) {
+ tempLink = new Link();
+ String uri = cleanLink(node.attr("href"), false);
+ setUri(uri);
+
+ //simple example of Help:IPA
+ // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]
+ } else if (link.contains(linkPrefix) && link.contains(":")) {
+ /**
+ * TODO buggy
+ * Cleans up child nodes: difficult example
+ * /ˈʃoʊpənhaʊ.ər/
+ */
+ if (!node.childNodes().isEmpty()) {
+ if (node.childNode(0).nodeName().equals("#text") &&
+ node.childNode(0).toString().contains(":") &&
+ !node.childNode(0).toString().contains("http")) {
+ tempLink = new Link();
+ String uri = cleanLink(node.attr("href"), false);
+ setUri(uri);
+ }
+ } else {
+ skipLevel = depth;
+ }
+ //TODO add example
+ } else if (node.attr("class").equals("external text")) {
+ //don't skip external links
+ tempLink = new Link();
+ String uri = cleanLink(node.attr("href"), true);
+ setUri(uri);
+
+ } else {
+ skipLevel = depth;
+ }
+ }
+
+
} else if(node.nodeName().equals("p")) {
if(paragraph != null) {
addParagraph("p");
@@ -136,6 +166,7 @@ public void head(Node node, int depth) {
skipLevel = depth;
} else if(node.nodeName().equals("span")) {
//denote notes
+
if(node.attr("class").contains("notebegin"))
addParagraph("note");
@@ -159,13 +190,21 @@ private void setUri(String uri) {
private String cleanLink(String uri, boolean external) {
if(!external) {
+
+ String linkPrefix = "/wiki/";
+ String linkPrefix2= "./";
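+ // note: when "?title=" is absent, indexOf returns -1, so +7 strips the 6-character "/wiki/" prefix and +3 strips "./"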
+ if(uri.contains(linkPrefix)){
+ uri=uri.substring(uri.indexOf("?title=")+7);
+ } else if (uri.contains(linkPrefix2)) {
+ uri=uri.substring(uri.indexOf("?title=")+3);
+ }
//TODO central string management
if(!this.context.language.equals("en")) {
-
- uri="http://"+this.context.language+".dbpedia.org/resource/"+uri.substring(uri.indexOf("?title=")+7);
+ uri="http://"+this.context.language+".dbpedia.org/resource/"+uri;
- } else {
- uri="http://dbpedia.org/resource/"+uri.substring(uri.indexOf("?title=")+7);
+ }
+ else {
+ uri="http://dbpedia.org/resource/"+uri;
}
uri = uri.replace("&action=edit&redlink=1", "");
@@ -183,14 +222,15 @@ private String cleanLink(String uri, boolean external) {
e.printStackTrace();
}
}
-
return UriUtils.uriToDbpediaIri(uri).toString();
}
public void tail(Node node, int depth) {
-
+
+
if(skipLevel>0) {
if(skipLevel==depth) {
+
skipLevel = -1;
return;
} else {
@@ -198,7 +238,7 @@ public void tail(Node node, int depth) {
}
}
- if(node.nodeName().equals("a")&&inLink) {
+ if(node.nodeName().equals("a") && inLink) {
inLink = false;
paragraph.addLink(tempLink);
tempLink = new Link();
@@ -210,6 +250,7 @@ else if(node.nodeName().equals("p") && paragraph != null) {
addParagraph("p");
}
else if(node.nodeName().equals("sup") && inSup) {
+
inSup = false;
}
else if(node.nodeName().matches("h\\d")) {
diff --git a/core/src/main/resources/datasetdefinitions.json b/core/src/main/resources/datasetdefinitions.json
index 521f573410..e93917a858 100644
--- a/core/src/main/resources/datasetdefinitions.json
+++ b/core/src/main/resources/datasetdefinitions.json
@@ -626,5 +626,21 @@
"traits":"LinkedData, Published",
"defaultgraph": "dataset"
}
+ },
+ "history": {
+ "history_dataset": {
+ "name": "History Links",
+ "traits":"LinkedData, Published",
+ "desc": "All data related to history",
+ "defaultgraph": "dataset"
+ },
+ "history_stats": {
+ "name": "History Stats",
+ "traits":"LinkedData, Published",
+ "desc": "Statistics related to edition statistics",
+ "defaultgraph": "dataset"
+ }
+
+
}
}
diff --git a/core/src/main/scala/org/dbpedia/extraction/config/Config.scala b/core/src/main/scala/org/dbpedia/extraction/config/Config.scala
old mode 100644
new mode 100755
index e10453e7d1..54ede5c402
--- a/core/src/main/scala/org/dbpedia/extraction/config/Config.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/config/Config.scala
@@ -94,10 +94,10 @@ class Config(val configPath: String) extends
}
/**
- * Number of parallel processes allowed. Depends on the number of cores, type of disk and IO speed
+ * Number of parallel processes allowed. Depends on the number of cores, type of disk, and IO speed
*
*/
- lazy val parallelProcesses: Int = this.getProperty("parallel-processes", "4").trim.toInt
+ lazy val parallelProcesses: Int = this.getProperty("parallel-processes", "1").trim.toInt
lazy val sparkMaster: String = Option(getString(this, "spark-master")).getOrElse("local[*]")
@@ -259,18 +259,32 @@ class Config(val configPath: String) extends
}
lazy val mediawikiConnection: MediaWikiConnection = Try {
+
MediaWikiConnection(
- apiUrl = this.getProperty("mwc-apiUrl", "").trim,
+ apiType = this.getProperty("mwc-type", "").trim,
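+ // mwc-type selects the connector: "rest" (Wikimedia REST HTML endpoint), "mwc" (MediaWiki action API) or "local" (a local MediaWiki instance)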
+ apiUrl = this.getProperty("mwc-type").trim match {
+ case "rest" => this.getProperty("mwc-apiRestUrl", "").trim
+ case "mwc" => this.getProperty("mwc-apiMWCUrl", "").trim
+ case "local" => this.getProperty("mwc-apiLocalUrl", "").trim
+ },
maxRetries = this.getProperty("mwc-maxRetries", "4").trim.toInt,
connectMs = this.getProperty("mwc-connectMs", "2000").trim.toInt,
readMs = this.getProperty("mwc-readMs", "5000").trim.toInt,
- sleepFactor = this.getProperty("mwc-sleepFactor", "1000").trim.toInt
+ sleepFactor = this.getProperty("mwc-sleepFactor", "1000").trim.toInt,
+ maxlag = this.getProperty("mwc-maxlag", "5").trim.toInt,
+ useragent = this.getProperty("mwc-useragent", "anonymous").trim,
+ gzip = this.getProperty("mwc-gzip","false").trim.toBoolean,
+ retryafter = this.getProperty("mwc-retryafter", "false").trim.toBoolean,
+ accept = this.getProperty("mwc-accept", "text/html").trim,
+ charset = this.getProperty("mwc-charset", "utf-8").trim,
+ profile = this.getProperty("mwc-profile", "https://www.mediawiki.org/wiki/Specs/HTML/2.1.0").trim
)
} match{
case Success(s) => s
- case Failure(f) => throw new IllegalArgumentException("Not all necessary parameters for the 'MediaWikiConnection' class were provided or could not be parsed to the expected type.", f)
+ case Failure(f) => throw new IllegalArgumentException("Some parameters necessary for the 'MediaWikiConnection' class were not provided or could not be parsed to the expected type.", f)
}
+
lazy val abstractParameters: AbstractParameters = Try {
AbstractParameters(
abstractQuery = this.getProperty("abstract-query", "").trim,
@@ -364,12 +378,20 @@ object Config{
* @param sleepFactor
*/
case class MediaWikiConnection(
- apiUrl: String,
- maxRetries: Int,
- connectMs: Int,
- readMs: Int,
- sleepFactor: Int
- )
+ apiType: String,
+ apiUrl: String,
+ maxRetries: Int,
+ connectMs: Int,
+ readMs: Int,
+ sleepFactor: Int,
+ maxlag: Int,
+ useragent: String,
+ gzip: Boolean,
+ retryafter: Boolean,
+ accept : String,
+ charset: String,
+ profile: String
+ )
case class AbstractParameters(
abstractQuery: String,
diff --git a/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala b/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
index a040316c5b..0d9d32e263 100644
--- a/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/config/provenance/DBpediaDatasets.scala
@@ -278,8 +278,14 @@ object DBpediaDatasets extends java.io.Serializable
val CitatedFacts: Dataset = datasets("cited_facts") //TODO add description @Dimitris
//val CitationTypes = datasets.get("citation_types").get
+ /**
+ * History
+ *
+ */
+ val HistoryData: Dataset = datasets("history_dataset")
+ val HistoryStats: Dataset = datasets("history_stats")
- /**
+ /**
* misc
*/
val MainDataset: Dataset = datasets("main_dataset")
diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/NifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/NifExtractor.scala
old mode 100644
new mode 100755
index 28e7b533db..5d84fe96cc
--- a/core/src/main/scala/org/dbpedia/extraction/mappings/NifExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/mappings/NifExtractor.scala
@@ -3,10 +3,10 @@ package org.dbpedia.extraction.mappings
import org.dbpedia.extraction.annotations.ExtractorAnnotation
import org.dbpedia.extraction.config.Config
import org.dbpedia.extraction.config.provenance.DBpediaDatasets
-import org.dbpedia.extraction.nif.WikipediaNifExtractor
+import org.dbpedia.extraction.nif.{WikipediaNifExtractorRest, WikipediaNifExtractor}
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.transform.Quad
-import org.dbpedia.extraction.util.{Language, MediaWikiConnector}
+import org.dbpedia.extraction.util.{Language, MediawikiConnectorConfigured, MediaWikiConnectorRest}
import org.dbpedia.extraction.wikiparser._
import scala.language.reflectiveCalls
@@ -41,12 +41,11 @@ class NifExtractor(
protected val writeLinkAnchors: Boolean = context.configFile.nifParameters.writeLinkAnchor
protected val writeStrings: Boolean = context.configFile.nifParameters.writeAnchor
protected val shortAbstractLength: Int = context.configFile.abstractParameters.shortAbstractMinLength
-
+ protected val abstractsOnly : Boolean = context.configFile.nifParameters.abstractsOnly
protected val dbpediaVersion: String = context.configFile.dbPediaVersion
override val datasets = Set(DBpediaDatasets.NifContext,DBpediaDatasets.NifPageStructure,DBpediaDatasets.NifTextLinks,DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts, DBpediaDatasets.RawTables, DBpediaDatasets.Equations)
- private val mwConnector = new MediaWikiConnector(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))
override def extract(pageNode : WikiPage, subjectUri : String): Seq[Quad] =
{
@@ -56,13 +55,24 @@ class NifExtractor(
//Don't extract abstracts from redirect and disambiguation pages
if(pageNode.isRedirect || pageNode.isDisambiguation) return Seq.empty
- //Retrieve page text
- val html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match{
- case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
- case None => return Seq.empty
- }
+ var html = ""
+ val mwcType = context.configFile.mediawikiConnection.apiType
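+ // pick the connector and NIF extractor implementation matching the configured mwc-type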
- new WikipediaNifExtractor(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
+ if (mwcType == "rest") {
+ val mwConnector = new MediaWikiConnectorRest(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))
+ html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
+ case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
+ case None => return Seq.empty
+ }
+ new WikipediaNifExtractorRest(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
+ } else {
+ val mwConnector = new MediawikiConnectorConfigured(context.configFile.mediawikiConnection, context.configFile.nifParameters.nifTags.split(","))
+ html = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
+ case Some(t) => NifExtractor.postProcessExtractedHtml(pageNode.title, t)
+ case None => return Seq.empty
+ }
+ new WikipediaNifExtractor(context, pageNode).extractNif(html)(err => pageNode.addExtractionRecord(err))
+ }
}
}
diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/PlainAbstractExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/PlainAbstractExtractor.scala
old mode 100644
new mode 100755
index 59cf57d029..87acba9558
--- a/core/src/main/scala/org/dbpedia/extraction/mappings/PlainAbstractExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/mappings/PlainAbstractExtractor.scala
@@ -7,7 +7,7 @@ import org.dbpedia.extraction.config.provenance.DBpediaDatasets
import org.dbpedia.extraction.ontology.Ontology
import org.dbpedia.extraction.transform.{Quad, QuadBuilder}
import org.dbpedia.extraction.util.abstracts.AbstractUtils
-import org.dbpedia.extraction.util.{Language, MediaWikiConnector, WikiUtil}
+import org.dbpedia.extraction.util.{Language, MediawikiConnectorConfigured}
import org.dbpedia.extraction.wikiparser._
import scala.language.reflectiveCalls
@@ -63,7 +63,6 @@ extends WikiPageExtractor
override val datasets = Set(DBpediaDatasets.LongAbstracts, DBpediaDatasets.ShortAbstracts)
- private val mwConnector = new MediaWikiConnector(context.configFile.mediawikiConnection, context.configFile.abstractParameters.abstractTags.split(","))
override def extract(pageNode : WikiPage, subjectUri: String): Seq[Quad] =
{
@@ -79,7 +78,8 @@ extends WikiPageExtractor
//val abstractWikiText = getAbstractWikiText(pageNode)
// if(abstractWikiText == "") return Seq.empty
- //Retrieve page text
+
+ val mwConnector = new MediawikiConnectorConfigured(context.configFile.mediawikiConnection, context.configFile.abstractParameters.abstractTags.split(","))
val text = mwConnector.retrievePage(pageNode.title, apiParametersFormat, pageNode.isRetry) match {
case Some(t) => PlainAbstractExtractor.postProcessExtractedHtml(pageNode.title, replacePatterns(t))
case None => return Seq.empty
diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
old mode 100644
new mode 100755
index c886538d6f..90e70ae151
--- a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
@@ -61,28 +61,36 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
var context = ""
var offset = 0
+
val quads = for(section <- sections) yield {
extractTextFromHtml(section, new NifExtractorContext(language, subjectIri, templateString)) match {
case Success(extractionResults) => {
sectionMap.put(section, extractionResults)
sectionMap.put(extractionResults, extractionResults)
+
if (context.length != 0) {
context = context + "\n\n"
offset += 2
}
+
var quads = if(nifParameters.abstractsOnly)
Seq()
else
makeStructureElements(extractionResults, nifContextIri, graphIri, offset)
+
offset += extractionResults.getExtractedLength
context += extractionResults.getExtractedText
//collect additional triples
quads ++= extendSectionTriples(extractionResults, graphIri, subjectIri)
+
+
//forward exceptions
extractionResults.errors.foreach(exceptionHandle(_, RecordSeverity.Warning, null))
+
+
quads
}
case Failure(e) => {
@@ -143,6 +151,7 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
triples += nifStructure(p.getSectionIri(), RdfNamespace.NIF.append("nextSection"), sectionUri, sourceUrl, null)
case None =>
}
+
section.getTop match{
case Some(p) =>
triples += nifStructure(sectionUri, RdfNamespace.NIF.append("superString"), p.getSectionIri(), sourceUrl, null)
@@ -159,12 +168,17 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
triples += nifStructure(contextUri, RdfNamespace.NIF.append("lastSection"), sectionUri, sourceUrl, null)
}
else{
- triples += nifStructure(sectionUri, RdfNamespace.NIF.append("superString"), section.getTop.get.getSectionIri(), sourceUrl, null)
- triples += nifStructure(section.getTop.get.getSectionIri(), RdfNamespace.NIF.append("hasSection"), sectionUri, sourceUrl, null)
- if (section.prev.isEmpty)
- triples += nifStructure(section.getTop.get.getSectionIri(), RdfNamespace.NIF.append("firstSection"), sectionUri, sourceUrl, null)
- if (section.next.isEmpty)
- triples += nifStructure(section.getTop.get.getSectionIri(), RdfNamespace.NIF.append("lastSection"), sectionUri, sourceUrl, null)
+ // Added this guard because a page end caused problems here (top is set but getTop cannot be resolved)
+ if (section.getTop.isDefined) {
+ triples += nifStructure(sectionUri, RdfNamespace.NIF.append("superString"), section.getTop.get.getSectionIri(), sourceUrl, null)
+ triples += nifStructure(section.getTop.get.getSectionIri(), RdfNamespace.NIF.append("hasSection"), sectionUri, sourceUrl, null)
+ if (section.prev.isEmpty)
+ triples += nifStructure(section.getTop.get.getSectionIri(), RdfNamespace.NIF.append("firstSection"), sectionUri, sourceUrl, null)
+ if (section.next.isEmpty)
+ triples += nifStructure(section.getTop.get.getSectionIri(), RdfNamespace.NIF.append("lastSection"), sectionUri, sourceUrl, null)
+ }
+
+
}
//further specifying paragraphs of every section
@@ -341,7 +355,9 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
}
protected def getJsoupDoc(html: String): Document = {
- val doc = Jsoup.parse(html.replaceAll("\n", ""))
+
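+ // cleanHtml is assumed to pre-clean the raw API response before Jsoup parsing (it replaces the former newline stripping)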
+ val htmlClean = cleanHtml(html)
+ val doc = Jsoup.parse(htmlClean)
//delete queries
for(query <- cssSelectorConfigMap.removeElements)
diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/WikipediaNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/WikipediaNifExtractor.scala
old mode 100644
new mode 100755
index 64764d3bf5..3f99c16854
--- a/core/src/main/scala/org/dbpedia/extraction/nif/WikipediaNifExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/nif/WikipediaNifExtractor.scala
@@ -210,15 +210,15 @@ class WikipediaNifExtractor(
tocMap
}
- private def isWikiPageEnd(node: Node): Boolean ={
+ protected def isWikiPageEnd(node: Node): Boolean ={
cssSelectorTest(node, cssSelectorConfigMap.findPageEnd)
}
- private def isWikiToc(node: Node): Boolean ={
+ protected def isWikiToc(node: Node): Boolean ={
cssSelectorTest(node, cssSelectorConfigMap.findToc)
}
- private def isWikiNextTitle(node: Node): Boolean ={
+ protected def isWikiNextTitle(node: Node): Boolean ={
cssSelectorTest(node, cssSelectorConfigMap.nextTitle)
}
diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/WikipediaNifExtractorRest.scala b/core/src/main/scala/org/dbpedia/extraction/nif/WikipediaNifExtractorRest.scala
new file mode 100755
index 0000000000..268dac376f
--- /dev/null
+++ b/core/src/main/scala/org/dbpedia/extraction/nif/WikipediaNifExtractorRest.scala
@@ -0,0 +1,137 @@
+package org.dbpedia.extraction.nif
+
+import org.dbpedia.extraction.config.Config
+import org.dbpedia.extraction.ontology.Ontology
+import org.dbpedia.extraction.util.Language
+import org.dbpedia.extraction.wikiparser.WikiPage
+import org.jsoup.nodes.{Document, Element, Node}
+import scala.collection.convert.decorateAsScala._
+import scala.collection.mutable
+import scala.collection.mutable.ListBuffer
+import scala.language.reflectiveCalls
+
+/**
+ * Created by Chile on 1/19/2017.
+ */
+class WikipediaNifExtractorRest(
+ context : {
+ def ontology : Ontology
+ def language : Language
+ def configFile : Config
+ },
+ wikiPage: WikiPage
+ )
+ extends WikipediaNifExtractor ( context ,wikiPage) {
+
+
+ /**
+ * extracts the relevant text
+ * @param html
+ * @return list of sections
+ */
+ override def getRelevantParagraphs (html: String): mutable.ListBuffer[PageSection] = {
+
+ val tocMap = new mutable.ListBuffer[PageSection]()
+ val doc: Document = getJsoupDoc(html)
+
+ var nodes = doc.select("body").first.childNodes.asScala
+
+ val currentSection = new ListBuffer[Int]() //keeps track of section number
+ currentSection.append(0) //initialize on abstract section
+
+ def getSection(currentNodes : scala.collection.mutable.Buffer[Node]) : Unit = {
+ //look for the next tag
+
+ var subnodes = currentNodes.head.childNodes().asScala
+ subnodes = subnodes.dropWhile(node => !node.nodeName().matches("h\\d") && !node.nodeName().matches("section"))
+ var processEnd=false
+ while (subnodes.nonEmpty && !processEnd) {
+ if (subnodes.head.nodeName().matches("h\\d")) {
+ val title = subnodes.headOption
+ processEnd=super.isWikiPageEnd(subnodes.head)
+
+ title match {
+
+ case Some(t) if super.isWikiNextTitle(t) && !processEnd=>
+
+ //calculate the section number by looking at the heading (h*) tags
+ val depth = Integer.parseInt(t.asInstanceOf[org.jsoup.nodes.Element].tagName().substring(1)) - 1
+ if (currentSection.size < depth) //first subsection
+ currentSection.append(1)
+ else {
+ //delete last entries depending on the depth difference to the last section
+ val del = currentSection.size - depth + 1
+ val zw = currentSection(currentSection.size - del)
+ currentSection.remove(currentSection.size - del, del)
+ //if it's just another section of the same level -> add one
+ if (currentSection.size == depth - 1)
+ currentSection.append(zw + 1)
+ }
+
+ subnodes = subnodes.drop(1)
+
+ val section = new PageSection(
+ //previous section (if on the same depth level)
+ prev = currentSection.last match {
+ case x: Int if x > 1 => tocMap.lastOption
+ case _ => None
+ },
+ //super section
+ top = tocMap.find(x => currentSection.size > 1 && x.ref == currentSection.slice(0, currentSection.size - 1).map(n => "." + n.toString).foldRight("")(_ + _).substring(1)),
+ next = None,
+ sub = None,
+ id = t.attr("id"),
+ title = t.asInstanceOf[Element].text(),
+ //merge section numbers separated by a dot
+ ref = currentSection.map(n => "." + n.toString).foldRight("")(_ + _).substring(1),
+ tableCount = 0,
+ equationCount = 0,
+ //take all following tags until you hit another title or end of content
+ content = Seq(t) ++ subnodes.takeWhile(node => !node.nodeName().matches("h\\d") && !node.nodeName().matches("section"))
+ )
+
+ tocMap.append(section)
+ case None => processEnd=true
+ case _ => processEnd=true
+ }
+ } else if (subnodes.head.nodeName().matches("section")) {
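+ // Parsoid/REST HTML nests subsections inside <section> elements, so recurse into them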
+ getSection(subnodes)
+ subnodes = subnodes.drop(1)
+ }
+
+ subnodes = subnodes.dropWhile(node => !node.nodeName().matches("h\\d") && !node.nodeName().matches("section"))
+ }
+
+ }
+
+
+ val abstractSect=doc.select("body").select("section").first.childNodes.asScala //get first section
+ val ab = abstractSect.filter(node => node.nodeName() == "p") //move cursor to abstract
+
+ nodes = nodes.drop(1)
+
+ tocMap.append(new PageSection( //save abstract (abstract = section 0)
+ prev = None,
+ top = None,
+ next = None,
+ sub = None,
+ id = "abstract",
+ title = "abstract",
+ ref = currentSection.map(n => "." + n.toString).foldRight("")(_+_).substring(1),
+ tableCount=0,
+ equationCount = 0,
+ content = ab
+ ))
+
+ if(!abstractsOnly) {
+ while (nodes.nonEmpty) {
+ getSection(nodes)
+ nodes = nodes.drop(1)
+ }
+ }
+ tocMap
+ }
+
+
+
+}
diff --git a/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnector.scala b/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnector.scala
old mode 100644
new mode 100755
index 68e903c239..23d619e96e
--- a/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnector.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnector.scala
@@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory
* @param connectionConfig - Collection of parameters necessary for API requests (see Config.scala)
* @param xmlPath - An array of XML tag names leading from the root (usually 'api') to the intended content of the response XML (depending on the request query used)
*/
+@deprecated("replaced by MediaWikiConnectorAbstract classes", "2022-09")
class MediaWikiConnector(connectionConfig: MediaWikiConnection, xmlPath: Seq[String]) {
protected val log = LoggerFactory.getLogger(classOf[MediaWikiConnector])
diff --git a/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnectorAbstract.scala b/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnectorAbstract.scala
new file mode 100755
index 0000000000..f6037bed90
--- /dev/null
+++ b/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnectorAbstract.scala
@@ -0,0 +1,78 @@
+package org.dbpedia.extraction.util
+
+import org.dbpedia.extraction.config.Config.MediaWikiConnection
+import org.dbpedia.extraction.wikiparser.WikiTitle
+import org.dbpedia.util.text.html.{HtmlCoder, XmlCodes}
+import org.slf4j.LoggerFactory
+
+import java.io.{InputStream, OutputStreamWriter}
+import java.net.{HttpURLConnection, URL}
+import java.time.temporal.ChronoUnit
+import scala.util.{Failure, Success, Try}
+
+/**
+ * The Mediawiki API connector
+ * @param connectionConfig - Collection of parameters necessary for API requests (see Config.scala)
+ * @param xmlPath - An array of XML tag names leading from the root (usually 'api') to the intended content of the response XML (depending on the request query used)
+ */
+abstract class MediaWikiConnectorAbstract(connectionConfig: MediaWikiConnection, xmlPath: Seq[String]) {
+
+ protected val log = LoggerFactory.getLogger(classOf[MediaWikiConnectorAbstract])
+ //protected def apiUrl: URL = new URL(connectionConfig.apiUrl)
+ //require(Try{apiUrl.openConnection().connect()} match {case Success(x)=> true case Failure(e) => false}, "can not connect to the apiUrl")
+
+ protected val maxRetries: Int = connectionConfig.maxRetries
+ require(maxRetries <= 10 && maxRetries > 0, "maxRetries has to be in the interval of [1,10]")
+
+ protected val retryAfter: Boolean = connectionConfig.retryafter
+ /** timeout for connection to web server, milliseconds */
+ protected val connectMs: Int = connectionConfig.connectMs
+ require(connectMs > 200, "connectMs shall be more than 200 ms!")
+
+ /** timeout for result from web server, milliseconds */
+ protected val readMs: Int = connectionConfig.readMs
+ require(readMs > 1000, "readMs shall be more than 1000 ms!")
+
+ /** sleep between retries, milliseconds, multiplied by CPU load */
+ protected val sleepFactorMs: Int = connectionConfig.sleepFactor
+ require(sleepFactorMs > 200, "sleepFactorMs shall be more than 200 ms!")
+
+ //protected val xmlPath = connectionConfig.abstractTags.split(",").map(_.trim)
+
+ private val osBean = java.lang.management.ManagementFactory.getOperatingSystemMXBean
+ private val availableProcessors = osBean.getAvailableProcessors
+
+ protected val CHARACTERS_TO_ESCAPE = List(
+ (";", "%3B"),
+ ("/", "%2F"),
+ ("?", "%3F"),
+ (":", "%3A"),
+ ("@", "%40"),
+ ("&", "%26"),
+ ("=", "%3D"),
+ ("+", "%2B"),
+ (",", "%2C"),
+ ("$", "%24")
+ )
+ /**
+ * Retrieves a Wikipedia page.
+ *
+ * @param pageTitle The encoded title of the page
+ * @return The page as an Option
+ */
+ def retrievePage(pageTitle : WikiTitle, apiParameterString: String, isRetry: Boolean = false) : Option[String]
+
+ def decodeHtml(text: String): Try[String] = {
+ val coder = new HtmlCoder(XmlCodes.NONE)
+ Try(coder.code(text))
+ }
+
+ /**
+ * Get the parsed and cleaned abstract text from the MediaWiki instance input stream.
+ * It returns the text wrapped in the API response, e.g.
+ *   <api><query><pages><page><extract>ABSTRACT_TEXT</extract></page></pages></query></api>
+ * or
+ *   <api><parse><text>ABSTRACT_TEXT</text></parse></api>
+ */
+ protected def readInAbstract(inputStream : InputStream) : Try[String]
+
+}
diff --git a/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnectorRest.scala b/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnectorRest.scala
new file mode 100755
index 0000000000..c7dfa164dc
--- /dev/null
+++ b/core/src/main/scala/org/dbpedia/extraction/util/MediaWikiConnectorRest.scala
@@ -0,0 +1,156 @@
+package org.dbpedia.extraction.util
+
+import org.dbpedia.extraction.config.Config.MediaWikiConnection
+import org.dbpedia.extraction.wikiparser.WikiTitle
+import java.io.InputStream
+import java.net.{HttpURLConnection, URL}
+import java.time.temporal.ChronoUnit
+import scala.collection.JavaConverters._
+import scala.math.pow
+import javax.xml.ws.WebServiceException
+import scala.io.Source
+import scala.util.{Failure, Success, Try}
+
+/**
+ * The Mediawiki API connector
+ * @param connectionConfig - Collection of parameters necessary for API requests (see Config.scala)
+ * @param xmlPath - An array of XML tag names leading from the root (usually 'api') to the intended content of the response XML (depending on the request query used)
+ */
+class MediaWikiConnectorRest(connectionConfig: MediaWikiConnection, xmlPath: Seq[String]) extends MediaWikiConnectorAbstract(connectionConfig, xmlPath ) {
+
+ protected val apiAccept: String = connectionConfig.accept
+ protected val apiCharset: String = connectionConfig.charset
+ protected val apiProfile: String = connectionConfig.profile
+ protected val userAgent: String = connectionConfig.useragent
+
+
+ /**
+ * Retrieves a Wikipedia page.
+ *
+ * @param pageTitle The encoded title of the page
+ * @return The page as an Option
+ */
+ override def retrievePage(pageTitle: WikiTitle, apiParameterString: String, isRetry: Boolean = false): Option[String] = {
+ val retryFactor = if (isRetry) 2 else 1
+ var SuccessParsing = false
+ var parsedAnswer: Try[String] = null
+ var waitingTime = sleepFactorMs
+
+
+ //val apiUrl: URL = new URL(connectionConfig.apiUrl.replace("{{LANG}}",pageTitle.language.wikiCode))
+ // The encoded title may contain some URI-escaped characters (e.g. "5%25-Klausel"),
+ // so we can't use URLEncoder.encode(). But "&" is not escaped, so we do this here.
+ // TODO: test this in detail!!! there may be other characters that need to be escaped.
+ // TODO central string management
+ var titleParam = pageTitle.encodedWithNamespace
+ this.CHARACTERS_TO_ESCAPE foreach {
+ case (search, replacement) => titleParam = titleParam.replace(search, replacement);
+ }
+ //replaces {{lang}} with the language
+ val url: String = connectionConfig.apiUrl.replace("{{LANG}}", pageTitle.language.wikiCode)
+
+ val parameters = "redirect=true"
+ val apiUrl: URL = new URL(url.concat(titleParam).concat("?"+parameters))
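+ // e.g. https://en.wikipedia.org/api/rest_v1/page/html/Some_Title?redirect=true (mwc-apiRestUrl with the title and redirect parameter appended)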
+
+
+
+ //println(s"mediawikiurl: $apiUrl")
+
+
+ for (counter <- 1 to maxRetries) {
+
+ val conn = apiUrl.openConnection
+ conn.setDoOutput(true) // TODO verify whether enabling output (POST) is really needed for this REST call
+
+ val start = java.time.LocalTime.now()
+
+ conn.setConnectTimeout(retryFactor * connectMs)
+ conn.setReadTimeout(retryFactor * readMs)
+ conn.setRequestProperty("accept", apiAccept)
+ conn.setRequestProperty("charset", apiCharset)
+ conn.setRequestProperty("profile", apiProfile)
+ conn.setRequestProperty("Accept-Language", pageTitle.language.wikiCode)
+ conn.setRequestProperty("User-Agent", userAgent)
+
+ val inputStream = conn.getInputStream
+ val answerHeader = conn.getHeaderFields()
+ val answerClean = answerHeader.asScala.filterKeys(_ != null)
+
+ if(conn.getHeaderField(null).contains("HTTP/1.1 200 OK") ){
+
+
+ val end = java.time.LocalTime.now()
+ conn match {
+ case connection: HttpURLConnection =>
+ log.debug("Request type: " + connection.getRequestMethod + "; URL: " + connection.getURL +
+ "; Parameters: " + parameters + "; HTTP code: " + connection.getHeaderField(null) +
+ "; Request time: " + start + "; Response time: " + end + "; Time needed: " +
+ start.until(end, ChronoUnit.MILLIS))
+ case _ =>
+ }
+ // Read answer
+ parsedAnswer = readInAbstract(inputStream)
+ SuccessParsing = parsedAnswer match {
+ case Success(str) => true
+ case Failure(_) => false
+ }
+ }
+ if(!SuccessParsing){
+ var sleepMs = sleepFactorMs
+ if (retryAfter && answerClean.contains("retry-after")) {
+ //println("GIVEN RETRY-AFTER > "+ answer_clean("retry-after").get(0))
+ waitingTime = Integer.parseInt(answerClean("retry-after").get(0)) * 1000
+
+ // exponential backoff test
+ sleepMs = pow(waitingTime, counter).toInt
+ //println("WITH EXPONENTIAL BACK OFF" + counter)
+ //println("Sleeping time double >>>>>>>>>>>" + pow(waiting_time, counter))
+ //println("Sleeping time int >>>>>>>>>>>" + sleepMs)
+
+ }
+ if (counter < maxRetries)
+ Thread.sleep(sleepMs)
+ else
+ throw new Exception("Timeout error retrieving abstract of " + pageTitle + " in " + counter + " tries.")
+ } else {
+
+
+ //println(s"mediawikiurl: $apiUrl?$parameters")
+ return parsedAnswer match {
+ case Success(str) => Option(str)
+ case Failure(e) => throw e
+ }
+ }
+
+ }
+ throw new Exception("Could not retrieve abstract after " + maxRetries + " tries for page: " + pageTitle.encoded)
+ }
+
+
+ /**
+ * Get the parsed and cleaned page content from the MediaWiki REST API input stream.
+ * The REST endpoint returns the page HTML directly, so after an error check the
+ * stream content is returned as-is.
+ */
+ override def readInAbstract(inputStream: InputStream): Try[String] = {
+ // for XML format
+ try{
+ val htmlAnswer = Source.fromInputStream(inputStream, "UTF-8").getLines().mkString("")
+ //var text = XML.loadString(xmlAnswer).asInstanceOf[NodeSeq]
+
+ //test for errors
+ val pattern = "(]+info=\")([^\\\"]+)".r
+ if (htmlAnswer.contains("error code=")) {
+ return Failure(new WebServiceException(pattern.findFirstMatchIn(htmlAnswer) match {
+ case Some(m) => m.group(2)
+ case None => "An unknown exception occurred while retrieving the source XML from the mediawiki API."
+ }))
+ }
+
+ Success(htmlAnswer)
+ }
+
+
+ }
+}
\ No newline at end of file
diff --git a/core/src/main/scala/org/dbpedia/extraction/util/MediawikiConnectorConfigured.scala b/core/src/main/scala/org/dbpedia/extraction/util/MediawikiConnectorConfigured.scala
new file mode 100755
index 0000000000..d6187ca17b
--- /dev/null
+++ b/core/src/main/scala/org/dbpedia/extraction/util/MediawikiConnectorConfigured.scala
@@ -0,0 +1,227 @@
+package org.dbpedia.extraction.util
+
+import org.dbpedia.extraction.config.Config.MediaWikiConnection
+import org.dbpedia.extraction.wikiparser.WikiTitle
+
+import java.io.{InputStream, OutputStreamWriter}
+import java.net.{HttpURLConnection, URL}
+import java.time.temporal.ChronoUnit
+import java.util.zip._
+import scala.math.pow
+import javax.xml.ws.WebServiceException
+import scala.io.Source
+import scala.util.{Failure, Success, Try}
+import scala.collection.JavaConverters._
+/**
+ * The Mediawiki API connector
+ * @param connectionConfig - Collection of parameters necessary for API requests (see Config.scala)
+ * @param xmlPath - An array of XML tag names leading from the root (usually 'api') to the intended content of the response XML (depending on the request query used)
+ */
+class MediawikiConnectorConfigured(connectionConfig: MediaWikiConnection, xmlPath: Seq[String]) extends MediaWikiConnectorAbstract(connectionConfig, xmlPath ){
+
+ protected val userAgent: String = connectionConfig.useragent
+ require(userAgent != "" , "userAgent must be declared !")
+ protected val gzipCall: Boolean = connectionConfig.gzip
+ protected val maxLag: Int = connectionConfig.maxlag
+ private val osBean = java.lang.management.ManagementFactory.getOperatingSystemMXBean
+ private val availableProcessors = osBean.getAvailableProcessors
+
+ /**
+ * Retrieves a Wikipedia page.
+ *
+ * @param pageTitle The encoded title of the page
+ * @return The page as an Option
+ */
+ override def retrievePage(pageTitle : WikiTitle, apiParameterString: String, isRetry: Boolean = false) : Option[String] =
+ {
+ val retryFactor = if(isRetry) 2 else 1
+ var waitingTime = sleepFactorMs
+ var SuccessParsing = false
+ var currentMaxLag= maxLag
+ var gzipok = true
+ var parsedAnswer: Try[String] = null
+ //replaces {{lang}} with the language
+ val apiUrl: URL = new URL(connectionConfig.apiUrl.replace("{{LANG}}",pageTitle.language.wikiCode))
+
+ // The encoded title may contain some URI-escaped characters (e.g. "5%25-Klausel"),
+ // so we can't use URLEncoder.encode(). But "&" is not escaped, so we do this here.
+ // TODO: test this in detail!!! there may be other characters that need to be escaped.
+ // TODO central string management
+ var titleParam = pageTitle.encodedWithNamespace
+ this.CHARACTERS_TO_ESCAPE foreach {
+ case (search, replacement) => titleParam = titleParam.replace(search, replacement);
+ }
+
+
+
+ for(counter <- 1 to maxRetries)
+ {
+ // Fill parameters
+ var parameters = "uselang=" + pageTitle.language.wikiCode
+
+ parameters += (pageTitle.id match {
+ case Some(id) if apiParameterString.contains("%d") =>
+ apiParameterString.replace("&page=%s", "").format(id)
+ case _ => apiParameterString.replaceAll("&pageid=[^&]+", "").format(titleParam)
+ })
+ //parameters += apiParameterString.replaceAll("&pageid=[^&]+", "").format(titleParam)
+ parameters += "&maxlag=" + currentMaxLag
+ // NEED TO BE ABLE TO MANAGE parsing
+ //parameters += "&redirects=1"
+
+
+ val conn = apiUrl.openConnection
+ val start = java.time.LocalTime.now()
+ conn.setDoOutput(true)
+ conn.setConnectTimeout(retryFactor * connectMs)
+ conn.setReadTimeout(retryFactor * readMs)
+ conn.setRequestProperty("User-Agent",userAgent)
+ if ( gzipCall ) conn.setRequestProperty("Accept-Encoding","gzip")
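+ // when mwc-gzip is set, the response body is expected gzip-compressed and is unwrapped with GZIPInputStream below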
+
+ //println(s"mediawikiurl: $apiUrl?$parameters")
+ val writer = new OutputStreamWriter(conn.getOutputStream)
+ writer.write(parameters)
+ writer.flush()
+ writer.close()
+ val answerHeader = conn.getHeaderFields()
+ val answerClean = answerHeader.asScala.filterKeys(_ != null)
+
+ // UNCOMMENT FOR LOG
+ /* var mapper = new ObjectMapper()
+ mapper.registerModule(DefaultScalaModule)
+ mapper.configure(SerializationFeature.WRITE_NULL_MAP_VALUES, false)
+
+
+ answer_clean += ("parameters" -> util.Arrays.asList(parameters) )
+ answer_clean += ("url" -> util.Arrays.asList(apiUrl.toString) )
+ answer_clean += ("titleParam" -> util.Arrays.asList(titleParam.toString) )
+ answer_clean += ("status" -> util.Arrays.asList(conn.getHeaderField(null)))
+
+ var jsonString = mapper.writeValueAsString(answer_clean);
+
+ log2.info(jsonString)*/
+
+ if(conn.getHeaderField(null).contains("HTTP/1.1 200 OK") ){
+ var inputStream = conn.getInputStream
+ // IF GZIP
+ if ( gzipCall ){
+ try {
+ inputStream = new GZIPInputStream(inputStream)
+ }catch {
+ case x:ZipException =>{
+ gzipok = false
+ }
+ }
+ }
+ val end = java.time.LocalTime.now()
+ conn match {
+ case connection: HttpURLConnection => {
+ log.debug("Request type: "+ connection.getRequestMethod + "; URL: " + connection.getURL +
+ "; Parameters: " + parameters +"; HTTP code: "+ connection.getHeaderField(null) +
+ "; Request time: "+start+"; Response time: " + end + "; Time needed: " +
+ start.until(end, ChronoUnit.MILLIS))
+ }
+ case _ =>
+ }
+
+ // Read answer
+ parsedAnswer = readInAbstract(inputStream)
+ SuccessParsing = parsedAnswer match {
+ case Success(str) => true
+ case Failure(e) => false
+ }
+
+
+ }
+ if(!SuccessParsing){
+ //println("ERROR DURING PARSING" )
+
+ var sleepMs = sleepFactorMs
+ if (retryAfter && answerClean.contains("retry-after") ){
+ //println("GIVEN RETRY-AFTER > "+ answer_clean("retry-after").get(0))
+ waitingTime = Integer.parseInt(answerClean("retry-after").get(0)) * 1000
+
+ // exponential backoff test
+ sleepMs = pow(waitingTime, counter).toInt
+ //println("WITH EXPONENTIAL BACK OFF" + counter)
+ //println("Sleeping time double >>>>>>>>>>>" + pow(waiting_time, counter))
+ //println("Sleeping time int >>>>>>>>>>>" + sleepMs)
+ if (currentMaxLag < 15) {
+ // INCREMENT MaxLag
+ currentMaxLag = currentMaxLag + 1
+ //println("> INCREASE MAX LAG : " + currentMaxLag)
+ }
+ if (counter < maxRetries)
+ Thread.sleep(sleepMs)
+ else
+ throw new Exception("Timeout error retrieving abstract of " + pageTitle + " in " + counter + " tries.")
+ }
+
+
+ }else{
+
+
+ //println(s"mediawikiurl: $apiUrl?$parameters")
+ return parsedAnswer match {
+ case Success(str) => Option(str)
+ case Failure(e) => throw e
+ }
+ }
+
+ }
+ throw new Exception("Could not retrieve abstract after " + maxRetries + " tries for page: " + pageTitle.encoded)
+
+ }
+
+
+
+ /**
+ * Get the parsed and cleaned abstract text from the MediaWiki instance input stream.
+ * It returns the text wrapped in the API response, e.g.
+ *   <api><query><pages><page><extract>ABSTRACT_TEXT</extract></page></pages></query></api>
+ * or
+ *   <api><parse><text>ABSTRACT_TEXT</text></parse></api>
+ */
+ override def readInAbstract(inputStream : InputStream) : Try[String] =
+ {
+ // for XML format
+ var xmlAnswer = Source.fromInputStream(inputStream, "UTF-8").getLines().mkString("")
+ //var text = XML.loadString(xmlAnswer).asInstanceOf[NodeSeq]
+
+ //test for errors
+ val pattern = "(]+info=\")([^\\\"]+)".r
+ if(xmlAnswer.contains("error code=")) {
+ return Failure(new WebServiceException(pattern.findFirstMatchIn(xmlAnswer) match {
+ case Some(m) => m.group(2)
+ case None => "An unknown exception occurred while retrieving the source XML from the mediawiki API."
+ }))
+ }
+
+
+ // REDIRECT CASE
+ // Implemented but useful?
+ //xmlAnswer = xmlAnswer.replaceAll("", "")
+ /*if (xmlAnswer.contains("" ) && xmlAnswer.contains("")) {
+ val indexBegin = xmlAnswer.indexOf("")
+ val indexEnd = xmlAnswer.indexOf("", indexBegin + "".length())
+ xmlAnswer=xmlAnswer.substring(0, indexBegin)+xmlAnswer.substring(indexEnd + 1, xmlAnswer.length())
+ }*/
+
+ //get rid of surrounding tags
+ // I limited the regex and added a second replace here because some pages, like the following one, returned malformed triples:
+ // "xml version=\"1.0\"?>123Movies, GoMovies, GoStream, MeMovies or 123movieshub was a network of file streaming websites operating from Vietnam which allowed users to watch films for free. It was called the world's \"most popular illegal site\" by the Motion Picture Association of America (MPAA) in March 2018, before being shut down a few weeks later on foot of a criminal investigation by the Vietnamese authorities. As of July 2022, the network is still active via clone sites."@en
+
+ xmlAnswer = xmlAnswer.replaceFirst("""<\?xml version=\"\d.\d\"\?>""", "").replaceFirst("""xml version=\"\d.\d\"\?>""","")
+
+ for (child <- xmlPath) {
+ if (xmlAnswer.contains("<" + child) && xmlAnswer.contains("" + child)) {
+ xmlAnswer = xmlAnswer.replaceFirst("<" + child + "[^>]*>", "")
+ xmlAnswer = xmlAnswer.substring(0, xmlAnswer.lastIndexOf("" + child + ">"))
+ }
+ else
+ return Failure(new WebServiceException("The response from the mediawiki API does not contain the expected XML path: " + xmlPath))
+ }
+
+ decodeHtml(xmlAnswer.trim)
+ }
+
+}
diff --git a/dump/src/test/bash/createMinidump_custom_sample.sh b/dump/src/test/bash/createMinidump_custom_sample.sh
new file mode 100755
index 0000000000..81cc513de5
--- /dev/null
+++ b/dump/src/test/bash/createMinidump_custom_sample.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+file="uris.lst"
+while getopts f: flag
+do
+ case "${flag}" in
+ f) file=${OPTARG};;
+ esac
+done
+echo "========================="
+echo "file: $file";
+echo "========================="
+
+fileUPDT=minidump_file_used.txt;
+if [ -f "$fileUPDT" ]
+then
+ rm -f $fileUPDT
+fi
+
+# sort the file
+LC_ALL=C sort -u -o $file $file
+
+SHACL=`rapper -i turtle ../resources/shacl-tests/* | cut -d ' ' -f1 | grep '^<' | sed 's/.*#//;s/^<//;s/>//' | sort -u | wc -l`
+
+echo "# Minidump Overview
+
+This readme is generated upon creation of the minidump by running \`./createMinidump.sh\` [code](https://github.com/dbpedia/extraction-framework/blob/master/dump/src/test/bash/createMinidump.sh).
+
+## SHACL Tests
+Total: $SHACL
+
+TODO match shacl to URIs with a SPARQL query
+
+" > minidump-overview.md
+
+echo "
+## Included Articles
+
+" > minidump-overview.md
+for i in `cat $file` ; do
+ echo "* $i">> minidump-overview.md
+done
+
+
+# detect languages
+LANG=`sed 's|^https://||;s|\.wikipedia.org.*||' $file | sort -u`
+
+
+
+for l in ${LANG} ; do
+ echo "LANGUAGE $l"
+ PAGES=`grep "$l.wikipedia.org" $file | sed 's|wikipedia.org/wiki/|wikipedia.org/wiki/Special:Export/|' `
+ # copy header
+ mkdir -p "../resources/minidumps/"$l
+ TARGET="../resources/minidumps/"$l"/wiki.xml"
+ echo "TARGET: $TARGET"
+ cp head.xml "$TARGET"
+ # process pages
+ for p in ${PAGES}; do
+ echo "PAGE: $p"
+
+ ## Sanitize page name for avoiding none results
+ p_uri=$(basename $p)
+ p_uri_clean=$( echo $p_uri |jq -Rr @uri )
+ p_sanitized=$(echo "$p" | sed "s/$p_uri/$p_uri_clean/")
+
+ echo "PAGE p_sanitized : $p_sanitized"
+
+ echo "" >> "$TARGET"
+
+ echo "" >> $TARGET
+ curl --progress-bar -L $p_sanitized \
+ | xmlstarlet sel -N x="http://www.mediawiki.org/xml/export-0.10/" -t -c "//x:page" \
+ | tail -n+2 >> $TARGET
+ echo "" >> "$TARGET"
+ done
+ echo "\n" >> $TARGET
+ cat "$TARGET" | lbzip2 > "$TARGET.bz2"
+ rm $TARGET
+
+done
+
+echo "$file" > ${fileUPDT};
+# curl $FIRST > $TMPFOLDER/main2.xml
+#xmlstarlet sel -N x="http://www.mediawiki.org/xml/export-0.10/" -t -c "//page" main2.xml
+# xmlstarlet ed -N x="http://www.mediawiki.org/xml/export-0.10/" --subnode "/x:mediawiki/x:siteinfo" --type elem -n "newsubnode" -v "" head.xml
diff --git a/dump/src/test/bash/createSampleRandomFromPageIDdataset.sh b/dump/src/test/bash/createSampleRandomFromPageIDdataset.sh
new file mode 100755
index 0000000000..018753c2a6
--- /dev/null
+++ b/dump/src/test/bash/createSampleRandomFromPageIDdataset.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+n=10;
+file="page_lang=ro_ids.ttl"
+lang="ro"
+while getopts n:l:f: flag
+do
+ case "${flag}" in
+ n) n=${OPTARG};;
+ l) lang=${OPTARG};;
+ f) file=${OPTARG};;
+
+ esac
+done
+
+
+echo "========================="
+echo "n: $n";
+echo "file: $file";
+echo "========================="
+
+grep -v "resource\/\w*\:" $file > temp.txt
+shuf -n $n temp.txt | grep -oP " <" | sed "s/> /g" | while read line; do echo "https://${lang}.wikipedia.org/wiki/$line"; done > uri_sample_random_${lang}_${n}.lst
+#rm -f temp.txt
\ No newline at end of file
diff --git a/dump/src/test/bash/create_custom_sample.sh b/dump/src/test/bash/create_custom_sample.sh
new file mode 100755
index 0000000000..9732af85e3
--- /dev/null
+++ b/dump/src/test/bash/create_custom_sample.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+lang="";
+n=1000;
+date_archive=$(date -d "$(date +%Y-%m-01) -1 day" +%Y-%m);
+sort="desc"
+while getopts l:d:n:s: flag
+do
+ case "${flag}" in
+ l) lang=${OPTARG};;
+ d) date_archive=${OPTARG};;
+ n) n=${OPTARG};;
+ s) sort=${OPTARG};;
+ esac
+done
+echo "========================="
+echo "lang: $lang";
+echo "date: $date_archive";
+echo "n: $n";
+echo "sort: $sort";
+echo "========================="
+
+
+clickstream_data="clickstream_data_${lang}_${date_archive}"
+if [ -f "$clickstream_data" ]
+then
+ echo "File found"
+else
+ echo "File not found"
+ clickstream_url="https://dumps.wikimedia.org/other/clickstream/";
+ content=$(curl -L "$clickstream_url$date_archive/")
+ links=$( echo $content | grep -Po '(?<=href=")[^"]*');
+ toextract="";
+ substr="-${lang}wiki-"
+ echo $substr
+ for link in ${links[@]}; do
+ echo $link
+ if [[ $link =~ "-${lang}wiki-" ]];then
+ toextract="$clickstream_url$date_archive/$link";
+ fi
+ done
+
+ if [[ $toextract == "" ]]; then
+ echo "Lang not found in clickstream";
+ exit 1;
+ fi
+
+ echo ">>>> DOWNLOAD $toextract and save it"
+
+ wget -O "${clickstream_data}.gz" $toextract;
+ gzip -d "${clickstream_data}.gz"
+fi
+
+
+echo ">>>> COMPUTE SUM OF CLICKS"
+declare -A dict
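+# dict accumulates, per target article (column 2), the sum of clickstream counts (column 4) over all referrers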
+while IFS= read -r line; do
+ IFS=$'\t'; arrIN=($line); unset IFS;
+ key=${arrIN[1]}
+ val=${arrIN[3]}
+ if [[ ${key} != *"List"* ]];then
+ if [[ ${#dict[${key}]} -eq 0 ]] ;then
+ dict[${key}]=$(($val));
+ else
+ dict[${key}]=$((${dict[${key}]}+$val));
+ fi
+ fi
+done < $clickstream_data
+
+echo ">>>> SORT IT AND SAVE TEMP"
+if [[ $sort == "desc" ]]; then
+ for page in "${!dict[@]}"
+ do
+ echo "$page ${dict[$page]}"
+ done | sort -rn -k2 | head -n "$n" | cut -d ' ' -f 1 >> temp.txt;
+else
+ for page in "${!dict[@]}"
+ do
+ echo "$page ${dict[$page]}"
+ done | sort -n -k2 | head -n "$n" | cut -d ' ' -f 1 >> temp.txt;
+fi
+
+
+echo ">>>>> SAVE FINAL FILE : uri_sample_${lang}_${sort}_${n}.lst"
+while IFS= read -r line;do
+ echo "https://$lang.wikipedia.org/wiki/$line" >> "uri_sample_${lang}_${sort}_${n}.lst"
+done < "temp.txt"
+
+rm -rf temp.txt
diff --git a/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties b/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties
old mode 100644
new mode 100755
index 1083e6db10..499789ef0e
--- a/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties
+++ b/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties
@@ -30,13 +30,15 @@ require-download-complete=false
# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
# NOTE sync with minidumps
#languages=af,als,am,an,arz,ast,azb,ba,bar,bat-smg,bpy,br,bs,bug,cdo,ce,ceb,ckb,cv,fo,fy,gd,he,hsb,ht,ia,ilo,io,is,jv,ka,kn,ku,ky,la,lb,li,lmo,mai,mg,min,ml,mn,mr,mrj,ms,mt,my,mzn,nah,nap,nds,ne,new,nn,no,oc,or,os,pa,pms,pnb,qu,sa,sah,scn,sco,sh,si,simple,sq,su,sw,ta,te,tg,th,tl,tt,uz,vec,wa,xmf,yo,zh-min-nan,zh-yue
-languages=en,fr,de,nl,ro
+#languages=af,als,am,an,arz,ast,azb,ba,bar,bat-smg,bpy,br,bs,bug,cdo,ce,ceb,ckb,cv,fo,fy,gd,he,hsb,ht,ia,ilo,io,is,jv,ka,kn,ku,ky,la,lb,li,lmo,mai,mg,min,ml,mn,mr,mrj,ms,mt,my,mzn,nah,nap,nds,ne,new,nn,no,oc,or,os,pa,pms,pnb,qu,sa,sah,scn,sco,sh,si,simple,sq,su,sw,ta,te,tg,th,tl,tt,uz,vec,wa,xmf,yo,zh-min-nan,zh-yue
+languages=fr
+
# default namespaces: Main, File, Category, Template
# we only want abstracts for articles -> only main namespace
namespaces=Main
# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
-
+# Change to NifExtractor for extracting Nif texts
extractors=.HtmlAbstractExtractor
remove-broken-brackets-html-abstracts=true
# if ontology and mapping files are not given or do not exist, download info from mappings.dbpedia.org
@@ -64,11 +66,27 @@ format.ttl.bz2=turtle-triples;uri-policy.iri
#the following parameters are for the mediawiki api connection used in nif and abstract extraction
-mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
+
+mwc-apiMWCUrl=https://{{LANG}}.wikipedia.org/w/api.php
+mwc-apiRestUrl=https://{{LANG}}.wikipedia.org/api/rest_v1/page/html/
+mwc-apiLocalUrl=http://localhost:8080/api.php
+# chose "rest", "mwc" or "local"
+mwc-type=rest
+# MWC params
mwc-maxRetries=5
mwc-connectMs=4000
mwc-readMs=30000
mwc-sleepFactor=2000
+# MWC-specific params
+mwc-maxlag=3
+mwc-useragent=(https://dbpedia.org/; dbpedia@infai.org) DIEF
+mwc-gzip=true
+mwc-retryafter=true
+# REST-specific params
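+# accept, charset and profile are sent as request headers on each REST call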
+mwc-accept=text/html
+mwc-charset=utf-8
+mwc-profile=https://www.mediawiki.org/wiki/Specs/HTML/2.1.0
+
#parameters specific for the abstract extraction
abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
@@ -83,7 +101,7 @@ short-abstract-min-length=200
#parameters specific to the nif extraction
#only extract abstract (not the whole page)
-nif-extract-abstract-only=false
+nif-extract-abstract-only=true
#the request query string
nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
#the xml path of the response
diff --git a/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties b/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties
old mode 100644
new mode 100755
index d865cb4c45..4025e87854
--- a/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties
+++ b/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties
@@ -30,7 +30,8 @@ require-download-complete=false
# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
# NOTE sync with minidumps
#languages=af,als,am,an,arz,ast,azb,ba,bar,bat-smg,bpy,br,bs,bug,cdo,ce,ceb,ckb,cv,fo,fy,gd,he,hsb,ht,ia,ilo,io,is,jv,ka,kn,ku,ky,la,lb,li,lmo,mai,mg,min,ml,mn,mr,mrj,ms,mt,my,mzn,nah,nap,nds,ne,new,nn,no,oc,or,os,pa,pms,pnb,qu,sa,sah,scn,sco,sh,si,simple,sq,su,sw,ta,te,tg,th,tl,tt,uz,vec,wa,xmf,yo,zh-min-nan,zh-yue
-languages=en
+
+languages=ro
# default namespaces: Main, File, Category, Template
# we only want abstracts for articles -> only main namespace
namespaces=Main
@@ -64,11 +65,21 @@ format.ttl.bz2=turtle-triples;uri-policy.iri
#the following parameters are for the mediawiki api connection used in nif and abstract extraction
-mwc-apiUrl=https://{{LANG}}.wikipedia.org/w/api.php
+mwc-apiMWCUrl=https://{{LANG}}.wikipedia.org/w/api.php
+mwc-apiLocalUrl=http://localhost:8080/api.php
+# chose "mwc" or "local"
+mwc-type=mwc
+# MWC params
mwc-maxRetries=5
mwc-connectMs=4000
mwc-readMs=30000
mwc-sleepFactor=2000
+# MWC-specific params
+mwc-maxlag=3
+mwc-useragent=(https://dbpedia.org/; dbpedia@infai.org) DIEF
+mwc-gzip=true
+mwc-retryafter=true
+
#parameters specific for the abstract extraction
abstract-query=&format=xml&action=query&prop=extracts&exintro=&explaintext=&titles=%s
@@ -83,7 +94,7 @@ short-abstract-min-length=200
#parameters specific to the nif extraction
#only extract abstract (not the whole page)
-nif-extract-abstract-only=false
+nif-extract-abstract-only=true
#the request query string
nif-query=&format=xml&action=parse&prop=text&page=%s&pageid=%d
#the xml path of the response
diff --git a/dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTestAbstract.md b/dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTestAbstract.md
new file mode 100755
index 0000000000..c7ec48230c
--- /dev/null
+++ b/dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTestAbstract.md
@@ -0,0 +1,28 @@
+# ExtractionTestAbstract
+
+Designed for testing the abstract extractors.
+## Before all
+
+* Remove the `@DoNotDiscover` tag from `ExtractionTestAbstract`
+* Add the `@DoNotDiscover` tag to the other test classes
+
+## Procedure
+1. Clean your target directory with `mvn clean` in the root directory of DIEF
+1. Go to bash scripts via
+ ```shell
+ cd /dump/src/test/bash
+ ```
+1. OPTIONAL: Create a new Wikipedia minidump sample with
+ ```shell
+ bash create_custom_sample.sh -n $numberOfPage -l $lang -d $optionalDate
+ ```
+1. Process sample of Wikipedia pages
+ ```shell
+ bash createMinidump_custom_sample.sh -f $filename.lst
+ ```
+1. Update the extraction language parameter for your minidump sample in [`extraction.nif.abstracts.properties`](https://github.com/datalogism/extraction-framework/blob/gsoc-celian/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties) and in [`extraction.plain.abstracts.properties`](https://github.com/datalogism/extraction-framework/blob/gsoc-celian/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties)
+1. Change the name of your log in the [`ExtractionTestAbstract.scala`](https://github.com/datalogism/extraction-framework/blob/gsoc-celian/dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTestAbstract.scala) file
+1. Rebuild the app with `mvn install`, or just test it with
+ ```shell
+ mvn test -Dtest="ExtractionTestAbstract"
+ ```
diff --git a/dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTestAbstract.scala b/dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTestAbstract.scala
new file mode 100755
index 0000000000..db19d13fff
--- /dev/null
+++ b/dump/src/test/scala/org/dbpedia/extraction/dump/ExtractionTestAbstract.scala
@@ -0,0 +1,162 @@
+package org.dbpedia.extraction.dump
+import scala.io.Source
+import java.io.{BufferedWriter, File, FileWriter}
+import java.util.concurrent.ConcurrentLinkedQueue
+import org.apache.commons.io.FileUtils
+import org.dbpedia.extraction.config.Config
+import org.dbpedia.extraction.dump.TestConfig.{ date, mappingsConfig, minidumpDir, nifAbstractConfig, plainAbstractConfig}
+import org.dbpedia.extraction.dump.extract.ConfigLoader
+import org.dbpedia.extraction.dump.tags.ExtractionTestTag
+import org.scalatest.{BeforeAndAfterAll, DoNotDiscover, FunSuite}
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import com.fasterxml.jackson.databind.ObjectMapper
+import java.nio.file.{Files, Paths}
+import scala.concurrent.Future
+
+
+@DoNotDiscover
+class ExtractionTestAbstract extends FunSuite with BeforeAndAfterAll {
+ println(""" __ ____ _ __ ______ __
+ | / |/ (_)___ (_)___/ /_ ______ ___ ____ /_ __/__ _____/ /______
+ | / /|_/ / / __ \/ / __ / / / / __ `__ \/ __ \ / / / _ \/ ___/ __/ ___/
+ | / / / / / / / / / /_/ / /_/ / / / / / / /_/ / / / / __(__ ) /_(__ )
+ |/_/ /_/_/_/ /_/_/\__,_/\__,_/_/ /_/ /_/ .___/ /_/ \___/____/\__/____/
+ | /_/ ABSTRACTS""".replace("\r", "").stripMargin)
+
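+ // Stage every minidump as a <lang>wiki "pages-articles-multistream" dump under the configured
+ // dump directory, so the extraction jobs can pick it up like a regular Wikipedia dump.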
+ override def beforeAll() {
+ minidumpDir.listFiles().foreach(f => {
+ val wikiMasque = f.getName + "wiki"
+ val targetDir = new File(mappingsConfig.dumpDir, s"$wikiMasque/$date/")
+ // create directories
+ targetDir.mkdirs()
+ FileUtils.copyFile(
+ new File(f + "/wiki.xml.bz2"),
+ new File(targetDir, s"$wikiMasque-$date-pages-articles-multistream.xml.bz2")
+ )
+ })
+ }
+
+
+
+
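+  // Runs the NIF/HTML abstract extraction over the minidump sample and writes the collected
+  // status counters to a JSON log via writeTestResult.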
+test("extract html abstract datasets", ExtractionTestTag) {
+ Utils.renameAbstractsDatasetFiles("html")
+ println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> html abstract begin")
+ val jobsRunning1 = new ConcurrentLinkedQueue[Future[Unit]]()
+ val extractRes = extract(nifAbstractConfig, jobsRunning1)
+ writeTestResult("MWC_ro_html_rest_only",extractRes)
+ println("> html abstract end")
+
+ }
+
+
+/*test("extract plain abstract datasets", ExtractionTestTag) {
+ println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Plain abstract begin")
+ Utils.renameAbstractsDatasetFiles("plain")
+ val jobsRunning2 = new ConcurrentLinkedQueue[Future[Unit]]()
+ val extractRes2=extract(plainAbstractConfig, jobsRunning2)
+ writeTestResult("MWC_ro_plain_only",extractRes2)
+ println("> Plain abstract end")
+ }*/
+
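+  // Serializes the collected status maps to JSON and writes them to ../dump/test_logs/,
+  // naming the file after the given prefix, the minidump URI list used (if present) and today's date.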
+  def writeTestResult(fileName: String, content: Array[Map[String, String]]): Unit = {
+    val today = java.time.LocalDate.now.toString
+    val urisListUsed = "../dump/src/test/bash/minidump_file_used.txt"
+    var fileName2 = ""
+    val mapper = new ObjectMapper()
+    mapper.registerModule(DefaultScalaModule)
+    val json = mapper.writeValueAsString(content)
+
+    // create directories
+    val targetDir = new File("../dump/test_logs/")
+    targetDir.mkdirs()
+
+    if (Files.exists(Paths.get(urisListUsed))) {
+      val uriFile = Source.fromFile(urisListUsed)
+      val fileContents = uriFile.getLines.mkString
+      uriFile.close()
+      fileName2 = "../dump/test_logs/" + fileName + "_" + fileContents + "_" + today + ".log"
+    } else {
+      fileName2 = "../dump/test_logs/" + fileName + "_" + "base-list" + today + ".log"
+    }
+
+    val file = new File(fileName2)
+    val bw = new BufferedWriter(new FileWriter(file))
+    bw.write(json)
+    bw.close()
+  }
+
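+  // Runs every configured extraction job and returns, per job, the recorder's status values
+  // together with counters for common failure causes (HTTP 429/503, IOException, OOM, NPE).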
+  def extract(config: Config, jobsRunning: ConcurrentLinkedQueue[Future[Unit]]): Array[Map[String, String]] = {
+    println(">>>>>>>>> EXTRACT - BEGIN")
+    var mapResults = Array[Map[String, String]]()
+    val configLoader = new ConfigLoader(config)
+
+    val parallelProcesses = 1
+    println(parallelProcesses)
+    val jobs = configLoader.getExtractionJobs
+    println(">>>>>>>>> EXTRACT - NBJOBS > " + jobs.size)
+    println("LAUNCH JOBS")
+    for (job <- jobs) {
+      job.run()
+
+      val lang = job.extractionRecorder.language
+      val records = job.extractionRecorder
+      println(">>>>>>>>> EXTRACT - LANG > " + lang.wikiCode)
+
+      val status = records.getStatusValues(lang)
+      var numberOfFailedPages429 = 0
+      var numberOfFailedPages503 = 0
+      var numberOfFailedPagesIOException = 0
+      var numberOfFailedPagesOutOfMemoryError = 0
+      var numberOfFailedPagesNullPointerException = 0
+      var mapLocal = status
+
+      mapLocal += "language" -> lang.wikiCode.toString
+
+      try {
+        val listFailedPages_ = records.listFailedPages(lang)
+        for (failed <- listFailedPages_) {
+          if (failed.toString().contains("Server returned HTTP response code: 429")) {
+            numberOfFailedPages429 += 1
+          }
+          if (failed.toString().contains("Server returned HTTP response code: 503")) {
+            numberOfFailedPages503 += 1
+          }
+          if (failed.toString().contains("java.io.IOException")) {
+            numberOfFailedPagesIOException += 1
+          }
+          if (failed.toString().contains("java.lang.OutOfMemoryError")) {
+            numberOfFailedPagesOutOfMemoryError += 1
+          }
+          if (failed.toString().contains("java.lang.NullPointerException")) {
+            numberOfFailedPagesNullPointerException += 1
+          }
+        }
+      } catch {
+        case _: Exception => // no failed pages recorded for this language
+      }
+
+      mapLocal += "numberOfFailedPages429" -> numberOfFailedPages429.toString
+      mapLocal += "numberOfFailedPages503" -> numberOfFailedPages503.toString
+      mapLocal += "numberOfFailedPagesIOException" -> numberOfFailedPagesIOException.toString
+      mapLocal += "numberOfFailedPagesOutOfMemoryError" -> numberOfFailedPagesOutOfMemoryError.toString
+      mapLocal += "numberOfFailedPagesNullPointerException" -> numberOfFailedPagesNullPointerException.toString
+
+      mapResults = mapResults :+ mapLocal
+    }
+
+    while (jobsRunning.size() > 0) {
+      Thread.sleep(1000)
+    }
+
+    jobsRunning.clear()
+    mapResults
+  }
+}
diff --git a/history/ReadMe.md b/history/ReadMe.md
new file mode 100644
index 0000000000..dc781d25cc
--- /dev/null
+++ b/history/ReadMe.md
@@ -0,0 +1,182 @@
+# DBPEDIA HISTORY
+
+DBpedia History extracts the edit history of a Wikipedia chapter into an RDF format.
+
+
+## Previous work
+
+This DBpedia module is a Scala/Java version of the original work conducted by the French DBpedia chapter:
+
+Fabien Gandon, Raphael Boyer, Olivier Corby, Alexandre Monnin. Wikipedia editing history in DBpedia: extracting and publishing the encyclopedia editing activity as linked data. IEEE/WIC/ACM International Joint Conference on Web Intelligence (WI' 16), Oct 2016, Omaha, United States.
+https://hal.inria.fr/hal-01359575
+
+Fabien Gandon, Raphael Boyer, Olivier Corby, Alexandre Monnin. Materializing the editing history of Wikipedia as linked data in DBpedia. ISWC 2016 - 15th International Semantic Web Conference, Oct 2016, Kobe, Japan.
+https://hal.inria.fr/hal-01359583
+
+## A first working prototype
+
+This prototype is not optimized. During development we ran into the `WikiPage` type checks that are enforced in almost every module of the DBpedia pipeline,
+so we basically copied and renamed all the classes and objects needed to run the extractors.
+This design could easily be improved by making `WikiPage` and `WikiPageWithRevisions` inherit from the same abstract class (see the sketch below),
+but as a first step we did not want to impact the core module.
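+
+A minimal sketch of that refactoring (class and member names here are illustrative, not the framework's actual API):
+
+```scala
+// A shared abstract base would let the extractor pipeline accept either page flavour
+// without duplicating the WikiPage-specific code paths.
+abstract class BaseWikiPage {
+  def title: String
+  def source: String
+}
+
+class WikiPage(val title: String, val source: String) extends BaseWikiPage
+
+class WikiPageWithRevisions(
+  title: String,
+  source: String,
+  val revisions: List[RevisionNode]   // the revision list added by this module
+) extends WikiPage(title, source)
+
+// reduced stand-in for this module's RevisionNode
+case class RevisionNode(id: Long, timestamp: String, contributor: String)
+```
+
+With such a base type, the extractors could be written once against `BaseWikiPage` instead of being copy/pasted per page flavour.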
+
+Some other improvements that could be made:
+* Scala version
+* Using a dedicated history namespace that takes the DBpedia chapter language into account
+* Detecting when a revision changes the content of an `infobox`
+
+## Main classes
+
+* [WikipediaDumpParserHistory.java](src/main/java/org/dbpedia/extraction/sources/WikipediaDumpParserHistory.java) — for parsing the history dumps
+* [RevisionNode.scala](src/main/scala/org/dbpedia/extraction/wikiparser/RevisionNode.scala) — defines the revision node object
+* [WikiPageWithRevisions](src/main/scala/org/dbpedia/extraction/wikiparser/WikiPageWithRevisions.scala) — defines the wiki page object that carries its revision list
+
+## Extractors
+
+### [HistoryPageExtractor.scala](src/main/scala/org/dbpedia/extraction/mappings/HistoryPageExtractor.scala)
+
+ * Extracts all revisions of every Wikipedia page
+ * Uses the foaf, xsd, rdf, prov, dc and sioc vocabularies
+ * Describes each revision of each page: its content, date, size and importance, its author, and the delta with respect to the previous version of the page (see the sketch below)
+ * The user identifier depends on what is available: the IP address, the nickname, or the Wikipedia user id
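+
+To give an idea of the shape of this data, here is a small illustrative sketch (the property choices and the reduced revision record are assumptions for the example, not the extractor's exact output):
+
+```scala
+// Illustrative only: a reduced revision record and the kind of triples produced for it.
+case class Rev(id: Long, timestamp: String, contributorIp: String, comment: String)
+
+def revisionTriples(pageUri: String, r: Rev): Seq[String] = {
+  val revUri = s"$pageUri?oldid=${r.id}"
+  Seq(
+    // the revision is a specialization of the page it belongs to (PROV-O)
+    s"<$revUri> <http://www.w3.org/ns/prov#specializationOf> <$pageUri> .",
+    // creation date of the revision (Dublin Core)
+    s"""<$revUri> <http://purl.org/dc/terms/created> "${r.timestamp}"^^<http://www.w3.org/2001/XMLSchema#dateTime> .""",
+    // anonymous contributors are identified by their IP address (SIOC)
+    s"""<$revUri> <http://rdfs.org/sioc/ns#ip_address> "${r.contributorIp}" .""",
+    // the edit comment left by the contributor
+    s"""<$revUri> <http://purl.org/dc/terms/description> "${r.comment}" ."""
+  )
+}
+```
+
+The actual extractor emits these statements as quads through the framework rather than as raw strings.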
+
+### [HistoryStatsExtractor.scala](src/main/scala/org/dbpedia/extraction/mappings/HistoryStatsExtractor.scala)
+ * Extracts statistics about the revision activity of every Wikipedia page:
+   * number of revisions per year / month
+   * average revision size per year / month
+   * number of unique contributors
+ * This extraction adds extra computation and may not always be necessary
+ * Uses the dc, rdf and rdfs vocabularies (see the sketch below)
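+
+As a rough sketch of the computation (simplified; the record type and field names are assumptions for the example, not the extractor's code), grouping the revisions of a page by year and month is enough to derive these figures:
+
+```scala
+// Simplified sketch: derive the per-year / per-month statistics from a list of revisions.
+case class RevInfo(timestamp: String, contributor: String, length: Int)
+
+def revisionStats(revisions: Seq[RevInfo]): Unit = {
+  val year  = (r: RevInfo) => r.timestamp.take(4)                   // "2009-01-06T16:56:57Z" -> "2009"
+  val month = (r: RevInfo) => r.timestamp.slice(5, 7) + "/" + year(r)
+
+  val revPerYear     = revisions.groupBy(year).mapValues(_.size)                               // revisions per year
+  val revPerMonth    = revisions.groupBy(month).mapValues(_.size)                              // revisions per month
+  val avgSizePerYear = revisions.groupBy(year).mapValues(rs => rs.map(_.length).sum / rs.size) // average size per year
+  val uniqueContribs = revisions.map(_.contributor).distinct.size                              // unique contributors
+
+  println(revPerYear)      // e.g. Map("2009" -> 3, "2011" -> 1, ...)
+  println(revPerMonth)
+  println(avgSizePerYear)  // e.g. Map("2011" -> 434, ...)
+  println(uniqueContribs)
+}
+```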
+
+
+
+## How to run it?
+
+### Download
+
+* configure the [download.properties](download.properties) file
+* and run ```../run download download.properties```
+
+### Extraction
+
+* configure the [extraction.properties](extraction.properties) file
+* and run ```../run run extraction.properties```
+
+* Test it with `mvn test`. If you're starting cold, you may have to manually create an empty file of the form `frwiki-[YYYYMMDD]-download-complete` (for instance, `frwiki-20221209-download-complete`) in the `base-dir` defined in the `extraction.properties` file.
+
+### Extracted triples
+
+Given this little Wikipedia page: [Hôtes_de_passage](https://fr.wikipedia.org/wiki/H%C3%B4tes_de_passage)
+
+→ The `HistoryPageExtractor.scala` extractor will produce:
+```
+# The IRIs of this sample were lost when it was pasted into this README; only the literal values
+# remain. They include the revision id ("36815850"), its timestamp ("2009-01-06T16:56:57Z"),
+# the contributor IP ("82.244.44.195"), the edit comment ("Nouvelle page : ''Hôtes de passage''…"),
+# two size/delta values ("214") and a boolean flag ("false").
+...
+```
+
+→ The `HistoryStatsExtractor.scala` extractor will produce:
+```
+# As above, the IRIs of this sample were lost in this README; the literal values that remain give
+# the statistics computed for the page (plus a standalone count literal "9" whose predicate was lost):
+#   revisions per year:  2009: 3, 2011: 1, 2013: 1, 2014: 2, 2015: 2, 2021: 2
+#   revisions per month: 1/2009: 2, 7/2009: 1, 10/2011: 1, 9/2013: 1, 10/2014: 1, 12/2014: 1, 3/2015: 1, 10/2015: 1, 11/2021: 2
+#   average revision size per year: 2011: 434, 2013: 467, 2014: 591, 2015: 645, ...
+...
+