Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GSOC'24 Amharic chapter] Extend Existing Extractors For Amharic #766

Merged
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ object GeoCoordinateParserConfig

//map longitude letters used in languages to the ones used in English ("E" for East and "W" for West)
val longitudeLetterMap = Map(
"am" -> Map("E" -> "E", "W" -> "W"),
"de" -> Map("E" -> "E", "O" -> "E", "W" -> "W"),
"en" -> Map("E" -> "E", "W" -> "W"),
"cs" -> Map("E" -> "E", "W" -> "W"),
Expand All @@ -22,6 +23,7 @@ object GeoCoordinateParserConfig

//map latitude letters used in languages to the ones used in English ("N" for North and "S" for South)
val latitudeLetterMap = Map(
"am" -> Map("N" -> "N", "S" -> "S"),
"en" -> Map("N" -> "N", "S" -> "S"),
"cs" -> Map("N" -> "N", "S" -> "S"),
"mk" -> Map("N" -> "N", "S" -> "S")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,23 @@ object ParserUtilsConfig
"bln" -> 9,
"trillion" -> 12,
"quadrillion" -> 15
),
"am" -> Map(
"አስር" -> 1,
"መቶ" -> 2,
"መቶዎች" -> 2,
"thousand" -> 3,
"ሺህ" -> 3,
"million" -> 6,
"mln" -> 6,
"ሚሊዮን" -> 6,
"billion" -> 9,
"ቢሊዮን" -> 9,
"bln" -> 9,
"trillion" -> 12,
"ትሪሊዮን" -> 12,
"quadrillion" -> 15,
"ኳድሪሊየን" -> 15
),
// For the "ar" configuration, right-to-left rendering may look like a bug, but it is not.
// Don't change this unless you know what you are doing.
Meti-Adane marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ object DateIntervalMappingConfig
// Don't change this unless you know what you are doing.
Meti-Adane marked this conversation as resolved.
Show resolved Hide resolved
val presentMap = Map(
"en" -> Set("present", "now"), // for example see https://en.wikipedia.org/wiki/Donald_Trump -> Political party -> Republican (1987–1999, 2009–2011, 2012–present)
"am" -> Set("አሁን", "እስካሁን", "እስካሁን ድረስ"),
"ar" -> Set("الحاضر"),
"be" -> Set("па гэты дзень", "па сучаснасць"),
"bg" -> Set("до наши дни", "настояще", "досега"),
Expand Down Expand Up @@ -38,6 +39,7 @@ object DateIntervalMappingConfig

val sinceMap = Map(
"en" -> "since",
"am" -> "(?:ጀምሮ|አንሥቶ|አንስቶ|ከ)",
"ca" -> "des del",
"es" -> "desde",
"fr" -> "depuis",
Expand All @@ -48,12 +50,14 @@ object DateIntervalMappingConfig

// Per-language wording for "onward" (an interval with a start but no stated end).
// The "pt" entry lists two alternatives separated by '|'.
// NOTE(review): presumably the values are spliced into a regex by the date-interval parser - confirm.
val onwardMap = Map(
  ("en", "onward"),
  ("am", "በኋላ"),
  ("es", "en adelante"),
  ("pt", "adiante|avante")
)

val splitMap = Map(
"en" -> "to",
"am" -> "እስከ",
"es" -> "al|a la|a|hasta (?:el|la)",
"fr" -> "à|au",
"pl" -> "do",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ object DisambiguationExtractorConfig
// For "ar" and "he" configurations, rendering right-to-left may seem like a bug, but it's not.
// Don't change this unless you know what you're doing.
val disambiguationTitlePartMap = Map(
"am" -> " (መንታ)",
"ar" -> " (توضيح)",
"bg" -> " (пояснение)",
"ca" -> " (desambiguació)",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,19 @@ object GenderExtractorConfig
val pronounsMap = Map(
"en" -> Map("she" -> "female", "her" -> "female", "he" -> "male", "his" -> "male", "him" -> "male", "herself" -> "female", "himself" -> "male",
"She" -> "female", "Her" -> "female", "He" -> "male", "His" -> "male", "Him" -> "male", "Herself" -> "female", "Himself" -> "male" //TODO why not just do case insensitive matches?
),
"am" -> Map(
"እሷ" -> "ሴት",
"እሷን" -> "ሴት",
"የሷ" -> "ሴት",
"እራሷን" -> "ሴት",
"እራሷ" -> "ሴት",
"እሱ" -> "ወንድ",
"እሱን" -> "ወንድ",
"የእሱ" -> "ወንድ",
"የራሱ" -> "ወንድ",
"እራሱ" -> "ወንድ",
"እራሱን" -> "ወንድ"
),
"pt" -> Map ("ela"-> "mulher", "dela" -> "mulher", "ele" -> "homem", "dele" -> "homem", "nela" -> "mulher", "nele" -> "homem",
"Ela"-> "mulher", "Dela" -> "mulher", "Ele" -> "homem", "Dele" -> "homem", "Nela" -> "mulher", "Nele" -> "homem"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,23 @@ object HomepageExtractorConfig
// Don't change this unless you know what you are doing.

private val propertyNamesMap = Map(
"am" -> Set(
"ድህረገፅ",
"ድህረ_ገፅ",
"ገጽ",
"ድህረ ገጽ",
"ድህረ_ገጽ",
"ድረ_ገፅ",
"ድረገፅ",
"ድረገጽ",
"ድረ ገጽ",
"ድረ_ገጽ",
"ዋና_ገጽ",
"ዌብሳይት",
"website",
"web",
"site"
),
"ar" -> Set("الموقع", "الصفحة الرسمية", "موقع", "الصفحة الرئيسية", "صفحة ويب", "موقع ويب"),
"bg" -> Set("сайт", "уебсайт"),
"ca" -> Set("pàgina", "web", "lloc"),
Expand Down Expand Up @@ -38,6 +55,7 @@ object HomepageExtractorConfig
// Language codes for which homepage property names are configured: exactly the keys of propertyNamesMap.
val supportedLanguages = propertyNamesMap.keySet

private val externalLinkSectionsMap = Map(
"am" -> "(?:የውጭ ንባብ|የውጭ ማያያዣ)",
"ar" -> "وصلات خارجية",
"bg" -> "Външни препратки",
"ca" -> "(?:Enllaços externs|Enllaço extern)",
Expand Down Expand Up @@ -65,6 +83,7 @@ object HomepageExtractorConfig
}

private val officialMap = Map(
"am" -> "ዋና",
"ar" -> "رسمي",
"bg" -> "официален",
"ca" -> "oficial",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ object ImageExtractorConfig
// Don't change this unless you know what you are doing.
val NonFreeRegex = Map(
"ar" -> """(?i)\{\{\s?غير حر""".r,
"am" ->"""(?i)\{\{\s?(non-free|Logo|Screenshot|Noncommercial|ነፃ_ያልሆነ)""".r,
"bg" ->"""(?i)\{\{\s?non-free""".r,
"de" -> """(?iu)\{\{\s?(Dateiüberprüfung/benachrichtigt_\(Kategorie\)|Geschützt|Geschützt-Ungeklärt|Bild-LogoSH|Bild-PD-alt-100|Bild-PD-alt-1923|Bild-WikimediaCopyright)\s?\}\}""".r ,
"el" -> """(?iu)\{\{\s?(εύλογη χρήση|σήμα|σήμα αθλητικού σωματείου|αφίσα ταινίας|σκηνή από ταινία|γραφικά υπολογιστή|εξώφυλλο άλμπουμ|εξώφυλλο βιβλίου|μη ελεύθερο έργο τέχνης|σελίδα κόμικς|σελίδα εφημερίδας|εικόνα-βιντεοπαιχνίδι|ιδιοκτησία Wikimedia)\s?\}\}""".r ,
Expand All @@ -29,9 +30,9 @@ object ImageExtractorConfig
"ru" -> """(?iu)\{\{\s?(CopyrightByWikimedia|Fairuse|несвободный файл|несвободная лицензия|запрещенная лицензия)\s?\}\}""".r
)

val flagRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(flag|banner|pavillon|drapeau|bandera|pabellón|bandiera|флаг)([^\w]*|[_\s]+)""".r
val mapRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(map|karte|location|position|carte|carta|lage)([^\w]*|[_\s]+)""".r
val signatureRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(signature|unterschrift)""".r
val flagRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(flag|banner|pavillon|drapeau|bandera|pabellón|bandiera|флаг|ባንዲራ|ሰንደቅ_ዓላማ)([^\w]*|[_\s]+)""".r
val mapRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(map|karte|location|position|carte|carta|lage|ካርታ)([^\w]*|[_\s]+)""".r
val signatureRegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(signature|unterschrift|ፊርማ)""".r
val cOARegex = """(?iu)s?^([^a-zA-Z0-9]*|[\w\s]*[^a-zA-Z0-9]+)(coat_of_arms|emblem|crest|wappen|grandes_armes|blason|armoiries)([^\w]*|[_\s]+)""".r


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ object InfoboxExtractorConfig

val ignoreProperties = Map (
"en"-> Set("image", "image_photo", "map"),
"am"-> Set("ምስል", "ፎቶ", "ስዕል", "ካርታ", "አርማ"),
"ar"-> Set("صورة"),
"id"-> Set("foto", "gambar"),
"el"-> Set("εικόνα", "εικονα", "Εικόνα", "Εικονα", "χάρτης", "Χάρτης"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ object TopicalConceptsExtractorConfig
val catMainTemplates = Set(
"مزيد" ,// ar
"Infocat", "Infocatm", // ca
"Catmore", // el,ja
"Catmore", // el,ja,am
"Cat main", // en
"AP", // es
"Nagusia", // eu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,16 @@ class GeoCoordinateParserTest extends FlatSpec with Matchers
{
parse("fr", "{{coord|51/12/N|03/13/E}}") should equal (Some(51.2,3.216666666666667))
}


// Tests for Amharic
// Amharic ("am"): a degree/minute/second string with N/E hemisphere letters should parse to
// decimal degrees (12' = 0.2 degrees, 13' = 0.21666... degrees).
"GeoCoordinateParser(20º12'00\"N 03º13'00\"E)" should "return (20.2,3.216666666666667) for Amharic" in
{
  // Explicit tuple inside Some(...): same value as before, without relying on argument adaptation.
  parse("am", "20º12'00\"N 03º13'00\"E") should equal (Some((20.2, 3.216666666666667)))
}
// Amharic ("am"): the {{coord}} template with decimal degrees and N/E hemisphere letters
// should yield the same latitude/longitude values unchanged.
"GeoCoordinateParser({{coord|10.2|N|13.2|E}})" should "return (10.2,13.2) for Amharic" in
{
  // Explicit tuple inside Some(...): same value as before, without relying on argument adaptation.
  parse("am", "{{coord|10.2|N|13.2|E}}") should equal (Some((10.2, 13.2)))
}

private val wikiParser = WikiParser.getInstance()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ class ParserUtilsTest extends TestCase
// testConvertLargeNumbers("de", "1.234,5 trillion", "1234500000000000000000")
testConvertLargeNumbers("nl", "123 milja", "123 milja")
testConvertLargeNumbers("nl", "123 milj.", "123000000000")

// Tests for Amharic
testConvertLargeNumbers("am", "15 ሚሊዮን", "15000000")
testConvertLargeNumbers("am", "3 ሺህ", "3000")
testConvertLargeNumbers("am", "6 billion", "6000000000")



}

def testParse(): Unit = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@ class DateIntervalMappingTest extends FlatSpec with ShouldMatchers
parse("en", "xsd:date", "foo") should be (Seq())
}

// Tests for Amharic
// Amharic year range written as "ከ <start> እስከ <end>": both years are expected back, in order.
"DateIntervalMapping" should "return Seq 1988 2024 @am" in
{
  val amharicYearRange = "ከ 1988 እስከ 2024"
  val expectedYears = Seq("1988", "2024")
  parse("am", "xsd:gYear", amharicYearRange) should be (expectedYears)
}
// Amharic full-date range joined by "እስከ": both ISO dates are expected back, in order.
"DateIntervalMapping" should "return Seq 2014-07-01 2024-07-01 @am" in
{
  val amharicDateRange = "2014-07-01 እስከ 2024-07-01"
  val expectedDates = Seq("2014-07-01", "2024-07-01")
  parse("am", "xsd:date", amharicDateRange) should be (expectedDates)
}

private val wikiParser = WikiParser.getInstance()
private val ontology = {
Expand Down
2 changes: 2 additions & 0 deletions dump/extraction.default.properties
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesEx
.PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\
.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor

extractors.am=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.ImageExtractorNew,.CommonsResourceExtractor,.CitationExtractor,.AnchorTextExtractor

extractors.ar=.MappingExtractor,.TopicalConceptsExtractor

extractors.be=.MappingExtractor
Expand Down
2 changes: 2 additions & 0 deletions dump/extraction.mappings.properties
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ languages=@mappings

extractors=.MappingExtractor

#extractors.am=.MappingExtractor,.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.ImageExtractorNew,.CommonsResourceExtractor,.CitationExtractor,.AnchorTextExtractor
#
#extractors.ar=.MappingExtractor,.TopicalConceptsExtractor
#
#extractors.be=.MappingExtractor
Expand Down
2 changes: 1 addition & 1 deletion dump/extraction.topical.properties
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=true

languages=ar,ca,el,en,es,eu,fr,it,pt,ru
languages=am,ar,ca,el,en,es,eu,fr,it,pt,ru

# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"

Expand Down
Loading