diff --git a/core/src/main/scala/org/dbpedia/extraction/config/transform/TemplateTransformConfig.scala b/core/src/main/scala/org/dbpedia/extraction/config/transform/TemplateTransformConfig.scala index 8320171d94..d145751b96 100644 --- a/core/src/main/scala/org/dbpedia/extraction/config/transform/TemplateTransformConfig.scala +++ b/core/src/main/scala/org/dbpedia/extraction/config/transform/TemplateTransformConfig.scala @@ -76,6 +76,13 @@ object TemplateTransformConfig { private def extractFirstExternalLinkNode(node: Option[PropertyNode]) : Option[ExternalLinkNode] = { node .flatMap(_.children + .map(c => { + if (c.isInstanceOf[TextNode] && c.toPlainText.contains(".") && !c.toPlainText.contains(" ")) { + val text = c.toPlainText + val triedUri = UriUtils.createURI(if (!text.startsWith("http") && !text.contains(":")) "http://" + text else text) + triedUri.map(uri => ExternalLinkNode(uri, c.children, c.line)).getOrElse(c) + } else c + }) .filter(c => c.isInstanceOf[ExternalLinkNode]) .map(_.asInstanceOf[ExternalLinkNode]) .headOption @@ -169,7 +176,8 @@ object TemplateTransformConfig { PropertyNode("link-title", List(TextNode("", node.line)), node.line) } - // Check if this uri has a scheme. If it does not, add a default http:// scheme + + // Check if this uri has a scheme. If it does not, add a default http:// scheme // From https://en.wikipedia.org/wiki/Template:URL: // The first parameter is parsed to see if it takes the form of a complete URL. // If it doesn't start with a URI scheme (such as "http:", "https:", or "ftp:"), diff --git a/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/simple/SimpleWikiParser.scala b/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/simple/SimpleWikiParser.scala index aee4cdbdd8..a9fe6247c2 100644 --- a/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/simple/SimpleWikiParser.scala +++ b/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/simple/SimpleWikiParser.scala @@ -33,7 +33,7 @@ object SimpleWikiParser private val externalLinkLabelOrEnd = new Matcher(List(" ", "]", "\n")) private val externalLinkEnd = new Matcher(List("]", "\n"), true) - private val linkEnd = new Matcher(List(" ", "{","}", "[", "]", "\n", "\t")) + private val linkEnd = new Matcher(List(" ", "{","}", "[", "]", "|", "\n", "\t")) // '|=' is not valid wiki markup but safe to include, see http://sourceforge.net/tracker/?func=detail&atid=935521&aid=3572779&group_id=190976 private val propertyValueOrEnd = new Matcher(List("|=","=", "|", "}}"), true) diff --git a/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala b/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala index ed962f8aee..c2f3fef574 100644 --- a/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala +++ b/core/src/test/scala/org/dbpedia/extraction/dataparser/LinkParserTest.scala @@ -65,7 +65,7 @@ class LinkParserTest extends FlatSpec with Matchers } it should "return http://EXAMPLE.COM" in { - parse("{{URL|EXAMPLE.com}}") should equal (Some(build("http://EXAMPLE.COM"))) + parse("{{URL|EXAMPLE.COM}}") should equal (Some(build("http://EXAMPLE.COM"))) } it should "return http://www.example.com" in { @@ -100,17 +100,20 @@ class LinkParserTest extends FlatSpec with Matchers private val parser = WikiParser.getInstance() private val notStrictParser = new LinkParser(strict = false) - private def build(uri: String) : URI = { - URI.create(uri) + private def build(uri: String) : String = { + URI.create(uri).toString } - private def parse(input : String) : Option[IRI] = + private def parse(input : String) : Option[String] = { val page = new WikiPage(WikiTitle.parse("TestPage", Language.English), input) // Not strict parsing parser(page) match { - case Some(n) => notStrictParser.parse(n).map(_.value) + case Some(n) => { + val option = notStrictParser.parse(n) + option.map(_.value.toString) + } case None => None } } diff --git a/dump/src/test/resources/shaclTestsCoverageTable.md b/dump/src/test/resources/shaclTestsCoverageTable.md index 3d974b2730..da8edcc1f5 100644 --- a/dump/src/test/resources/shaclTestsCoverageTable.md +++ b/dump/src/test/resources/shaclTestsCoverageTable.md @@ -62,8 +62,10 @@ wikipage-uri|shacl-test|issue|comment [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last1](http://dbpedia.org/property/last1) #Citation_english_language_last1_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last](http://dbpedia.org/property/last) #Citation_english_language_last_datatype_validation | +[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/page](http://dbpedia.org/property/page) #Citation_english_language_page_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation | +[http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/year](http://dbpedia.org/property/year) #Citation_english_languagа_year_datatype_validation | [http://en.dbpedia.org/resource/Asda](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Asda&revid=&format=trix&extractors=custom) | [http://www.w3.org/2003/01/geo/wgs84_pos#long](http://www.w3.org/2003/01/geo/wgs84_pos#long) #wgs84_lat_long | | generic test for range of wgs84 lat/long | [http://en.dbpedia.org/resource/Atlantic_Ocean](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Atlantic_Ocean&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | [http://en.dbpedia.org/resource/Atlantic_Ocean](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Atlantic_Ocean&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | @@ -117,12 +119,16 @@ wikipage-uri|shacl-test|issue|comment [http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | [http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | [http://en.dbpedia.org/resource/Kerala_Agricultural_University](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Kerala_Agricultural_University&revid=&format=trix&extractors=custom) | [http://www.w3.org/2003/01/geo/wgs84_pos#long](http://www.w3.org/2003/01/geo/wgs84_pos#long) #wgs84_lat_long | | generic test for range of wgs84 lat/long | -[http://en.dbpedia.org/resource/Mini_(Mark_I)](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Mini_(Mark_I)&revid=&format=trix&extractors=custom) | -[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | +[http://en.dbpedia.org/resource/Mini_(Mark_I)](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Mini_(Mark_I)&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | +[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | +[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | +[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | +[http://en.dbpedia.org/resource/N.EX.T](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=N.EX.T&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation | [http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | [http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | [http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/last](http://dbpedia.org/property/last) #Citation_english_language_last_datatype_validation | [http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/title](http://dbpedia.org/property/title) #Citation_english_language_title_datatype_validation | +[http://en.dbpedia.org/resource/Ranma_½](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Ranma_½&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/work](http://dbpedia.org/property/work) #Citation_english_language_work_datatype_validation | [http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/accessDate](http://dbpedia.org/property/accessDate) #Citation_english_languagа_accessDate_datatype_validation | [http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/date](http://dbpedia.org/property/date) #Citation_english_language_date_datatype_validation | [http://en.dbpedia.org/resource/Redd_Kross](http://dief.tools.dbpedia.org/server/extraction/en/extract?title=Redd_Kross&revid=&format=trix&extractors=custom) | [http://dbpedia.org/property/isbn](http://dbpedia.org/property/isbn) #en_property_isbn_citation |