Skip to content

Commit

Permalink
Fix the issue with parsing sentences expressing the decade with century and number with unit in Spanish (#3154)
Browse files Browse the repository at this point in the history

* Date range without accents - Draft commit

* Date range without accents - Fix failure to recognize the Spanish for "written in the 90s" ("escrito en los anos 90")

* Date range without accents - Revert local nuget config and program.cs change

* DateRangeWithoutAccents - Resolved comments

---------

Co-authored-by: Michael Wang (Centific Technologies Inc) <[email protected]>
  • Loading branch information
MichaelMWW and Michael Wang (Centific Technologies Inc) authored Sep 12, 2024
1 parent cb8f16d commit 7534fab
Show file tree
Hide file tree
Showing 8 changed files with 298 additions and 61 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public static class DateTimeDefinitions
public const string RelativeMonthRegex = @"(?<relmonth>(de\s+)?((este|pr[oó]ximo|([uú]ltim(o|as|os)))\s+mes)|(del\s+)?(mes\s+((que\s+viene)|pasado)))\b";
public const string MonthRegex = @"\b(?<month>abr(\.|(il)?\b)|ago(\.|(sto)?\b)|dic(\.|(iembre)?\b)|feb(\.|(rero)?\b)|ene(\.|(ro)?\b)|ju[ln](\.|(io)?\b)|mar(\.|(zo)?\b)|may(\.|(o)?\b)|nov(\.|(iembre)?\b)|oct(\.|(ubre)?\b)|sep?t(\.|(iembre)?\b)|sep(\.|\b))";
public static readonly string MonthSuffixRegex = $@"(?<msuf>((del?|la|el)\s+)?({RelativeMonthRegex}|{MonthRegex}))";
public const string DateUnitRegex = @"(?<unit>(año|(?<uoy>semana))(?<plural>s)?|(?<uoy>mes)(?<plural>es)?|(?<uoy>d[ií]a)(?<plural>s)?(?<business>\s+(h[aá]biles|laborales))?)\b";
public const string DateUnitRegex = @"(?<unit>(a[nñ]o|(?<uoy>semana))(?<plural>s)?|(?<uoy>mes)(?<plural>es)?|(?<uoy>d[ií]a)(?<plural>s)?(?<business>\s+(h[aá]biles|laborales))?)\b";
public const string PastRegex = @"(?<past>\b(pasad(a|o)(s)?|[uú]ltim[oa](s)?|anterior(es)?|previo(s)?)\b)";
public const string FutureRegex = @"\b(siguiente(s)?|pr[oó]xim[oa](s)?)\b";
public static readonly string SimpleCasesRegex = $@"\b((desde(\s+el)?|entre|del?)\s+)?({DayRegex})\s*{TillRegex}\s*({DayRegex})\s+{MonthSuffixRegex}((\s+|\s*,\s*)((en|del?)\s+)?{YearRegex})?\b";
Expand Down Expand Up @@ -83,7 +83,7 @@ public static class DateTimeDefinitions
public static readonly string WhichWeekRegex = $@"\b(semana)(\s*)(?<number>5[0-3]|[1-4]\d|0?[1-9])(\s+del?\s+({YearRegex}|(?<order>pr[oó]ximo|[uú]ltimo|este)\s+año|año\s+(?<order>pasado)))?\b";
public static readonly string WeekOfRegex = $@"((del?|el|la)\s+)?(semana)(\s*)({OfPrepositionRegex}|que\s+(inicia|comienza)\s+el|(que\s+va|a\s+partir)\s+del)";
public static readonly string MonthOfRegex = $@"(mes)(\s+)({OfPrepositionRegex})";
public const string RangeUnitRegex = @"\b(?<unit>años?|mes(es)?|semanas?)\b";
public const string RangeUnitRegex = @"\b(?<unit>a[nñ]os?|mes(es)?|semanas?)\b";
public const string BeforeAfterRegex = @"^[.]";
public const string InConnectorRegex = @"\b(en)(?=\s*$)\b";
public const string TodayNowRegex = @"\b(hoy|ahora|este entonces)\b";
Expand Down Expand Up @@ -178,7 +178,7 @@ public static class DateTimeDefinitions
public const string DateTimeTimeOfDayRegex = @"\b(?<timeOfDay>mañana|madrugada|(?<pm>pasado\s+(el\s+)?medio\s?d[ií]a|tarde|noche))\b";
public static readonly string PeriodTimeOfDayRegex = $@"\b((en\s+(el|la|lo)?\s+)?({LaterEarlyRegex}\s+)?(est[ae]\s+)?{DateTimeTimeOfDayRegex})\b";
public static readonly string PeriodSpecificTimeOfDayRegex = $@"\b(({LaterEarlyRegex}\s+)?est[ae]\s+{DateTimeTimeOfDayRegex}|({StrictRelativeRegex}\s+{PeriodTimeOfDayRegex})|anoche)\b";
public const string UnitRegex = @"(?<unit>años?|(bi|tri|cuatri|se)mestre|mes(es)?|semanas?|fin(es)?\s+de\s+semana|finde|d[ií]as?|horas?|hra?s?|hs?|minutos?|mins?|segundos?|segs?|noches?)\b";
public const string UnitRegex = @"(?<unit>a[nñ]os?|(bi|tri|cuatri|se)mestre|mes(es)?|semanas?|fin(es)?\s+de\s+semana|finde|d[ií]as?|horas?|hra?s?|hs?|minutos?|mins?|segundos?|segs?|noches?)\b";
public const string ConnectorRegex = @"^(,|t|(para|y|a|en|por) las?|(\s*,\s*)?((cerca|alrededor)\s+)?(de\s+las?|del))$";
public const string TimeHourNumRegex = @"(?<hour>veint(i(uno|dos|tres|cuatro)|e)|cero|uno|dos|tres|cuatro|cinco|seis|siete|ocho|nueve|diez|once|doce|trece|catorce|quince|dieci(s([eé])is|siete|ocho|nueve))";
public static readonly string PureNumFromTo = $@"((\b(desde|de)\s+(la(s)?\s+)?)?({BaseDateTime.HourRegex}|{TimeHourNumRegex})(?!\s+al?\b)(\s*(?<leftDesc>{DescRegex}))?|(\b(desde|de)\s+(la(s)?\s+)?)({BaseDateTime.HourRegex}|{TimeHourNumRegex})(\s*(?<leftDesc>{DescRegex}))?)\s*{TillRegex}\s*({BaseDateTime.HourRegex}|{TimeHourNumRegex})\s*(?<rightDesc>{PmRegex}|{AmRegex}|{DescRegex})?";
Expand Down Expand Up @@ -225,14 +225,14 @@ public static class DateTimeDefinitions
public const string MiddlePauseRegex = @"^[.]";
public const string PrefixArticleRegex = @"\b(e[ln]\s+(d[ií]a\s+)?)";
public const string OrRegex = @"^[.]";
public static readonly string SpecialYearTermsRegex = $@"\b(({SpecialYearPrefixes}\s+años?\s+|años?\s+({SpecialYearPrefixes}\s+)?)(de\s+)?)";
public static readonly string SpecialYearTermsRegex = $@"\b(({SpecialYearPrefixes}\s+a[nñ]os?\s+|a[nñ]os?\s+({SpecialYearPrefixes}\s+)?)(de\s+)?)";
public static readonly string YearPlusNumberRegex = $@"\b({SpecialYearTermsRegex}((?<year>(\d{{2,4}}))|{FullTextYearRegex}))\b";
public static readonly string NumberAsTimeRegex = $@"\b({WrittenTimeRegex}|{HourRegex}(?<desc>\s*h(oras)?)?)\b";
public static readonly string TimeBeforeAfterRegex = $@"\b((?<=\b(antes|no\s+m[aá]s\s+tard(e|ar)\s+(de|a\s+las?)|por| después)\s+)({WrittenTimeRegex}|{HourNumRegex}|{BaseDateTime.HourRegex}|{MidTimeRegex}))\b";
public const string DateNumberConnectorRegex = @"^\s*(?<connector>a\s+las)\s*$";
public const string CenturyRegex = @"^[.]";
public const string DecadeRegex = @"(?<decade>diez|veinte|treinta|cuarenta|cincuenta|se[st]enta|ochenta|noventa)";
public static readonly string DecadeWithCenturyRegex = $@"(los\s+)?((((d[ée]cada(\s+de)?)\s+)(((?<century>\d|1\d|2\d)?(?<decade>\d0))))|a[ñn]os\s+((((dos\s+)?mil\s+)?({WrittenOneHundredToNineHundredRegex}\s+)?{DecadeRegex})|((dos\s+)?mil\s+)?({WrittenOneHundredToNineHundredRegex})(\s+{DecadeRegex}?)|((dos\s+)?mil)(\s+{WrittenOneHundredToNineHundredRegex}\s+)?{DecadeRegex}?))";
public static readonly string DecadeWithCenturyRegex = $@"(los\s+)?((((d[ée]cada(\s+de)?)\s+)(((?<century>\d|1\d|2\d)?(?<decade>\d0))))|a[ñn]os\s+((?<century>\d|1\d|2\d)?(?<decade>\d0)\b)|a[ñn]os\s+(((?<century>((dos\s+)?mil\s+)?({WrittenOneHundredToNineHundredRegex}\s+)?)?{DecadeRegex})|(?<century>((dos\s+)?mil\s+)?({WrittenOneHundredToNineHundredRegex}))(\s+{DecadeRegex}?)|(?<century>((dos\s+)?mil)(\s+{WrittenOneHundredToNineHundredRegex}\s+)?){DecadeRegex}?))";
public static readonly string RelativeDecadeRegex = $@"\b(((el|las?)\s+)?{RelativeRegex}\s+((?<number>[\w,]+)\s+)?(d[eé]cada|decenio)s?)\b";
public static readonly string ComplexDatePeriodRegex = $@"(?:((de(sde)?)\s+)?(?<start>.+)\s*({StrictTillRegex})\s*(?<end>.+)|((entre)\s+)(?<start>.+)\s*({RangeConnectorRegex})\s*(?<end>.+))";
public const string AmbiguousPointRangeRegex = @"^(mar\.?)$";
Expand All @@ -245,6 +245,8 @@ public static class DateTimeDefinitions
{
{ @"años", @"Y" },
{ @"año", @"Y" },
{ @"anos", @"Y" },
{ @"ano", @"Y" },
{ @"meses", @"MON" },
{ @"mes", @"MON" },
{ @"trimestre", @"3MON" },
Expand Down Expand Up @@ -287,6 +289,8 @@ public static class DateTimeDefinitions
{
{ @"años", 31536000 },
{ @"año", 31536000 },
{ @"anos", 31536000 },
{ @"ano", 31536000 },
{ @"meses", 2592000 },
{ @"mes", 2592000 },
{ @"semanas", 604800 },
Expand Down Expand Up @@ -639,7 +643,15 @@ public static class DateTimeDefinitions
public static readonly string ModSuffixRegex = $@"\b({AgoRegex}|{LaterRegex}|{BeforeAfterRegex}|{FutureSuffixRegex}|{PastSuffixRegex})\b";
public static readonly Dictionary<string, int> WrittenDecades = new Dictionary<string, int>
{
{ @"", 0 }
{ @"diez", 10 },
{ @"veinte", 20 },
{ @"treinta", 30 },
{ @"cuarenta", 40 },
{ @"cincuenta", 50 },
{ @"sesenta", 60 },
{ @"setenta", 70 },
{ @"ochenta", 80 },
{ @"noventa", 90 }
};
public static readonly Dictionary<string, int> SpecialDecadeCases = new Dictionary<string, int>
{
Expand Down Expand Up @@ -747,7 +759,9 @@ public static class DateTimeDefinitions
public static readonly IList<string> YearTerms = new List<string>
{
@"año",
@"años"
@"años",
@"ano",
@"anos"
};
public static readonly IList<string> YearToDateTerms = new List<string>
{
Expand All @@ -770,7 +784,7 @@ public static class DateTimeDefinitions
public const string MonthTypeRegex = @"(mes(es)?|mensual(es|mente)?)$";
public const string QuarterTypeRegex = @"(trimestral(es|mente)?)$";
public const string SemiAnnualTypeRegex = @"(semestral(es|mente)?)$";
public const string YearTypeRegex = @"(años?|anual(mente)?)$";
public const string YearTypeRegex = @"(a[nñ]os?|anual(mente)?)$";
public static readonly IList<string> ThisTerms = new List<string>
{
@"esta"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@ public static class NumbersWithUnitDefinitions
{
public static readonly Dictionary<string, string> AgeSuffixList = new Dictionary<string, string>
{
{ @"Año", @"años|año" },
{ @"Año", @"años|anos|año|ano" },
{ @"Mes", @"meses|mes" },
{ @"Semana", @"semanas|semana" },
{ @"Día", @"dias|días|día|dia" }
};
public static readonly IList<string> AmbiguousAgeUnitList = new List<string>
{
@"años",
@"anos",
@"año",
@"ano",
@"meses",
@"mes",
@"semanas",
Expand Down Expand Up @@ -737,7 +739,7 @@ public static class NumbersWithUnitDefinitions
{ @"Yarda", @"yd|yarda|yardas" },
{ @"Pulgada", @"pulgada|pulgadas|""" },
{ @"Pie", @"pie|pies|ft" },
{ @"Año luz", @"año luz|años luz|al" }
{ @"Año luz", @"año luz|ano luz|años luz|anos luz|al" }
};
public static readonly IList<string> AmbiguousLengthUnitList = new List<string>
{
Expand Down
29 changes: 21 additions & 8 deletions Patterns/Spanish/Spanish-DateTime.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ MonthSuffixRegex: !nestedRegex
def: (?<msuf>((del?|la|el)\s+)?({RelativeMonthRegex}|{MonthRegex}))
references: [ RelativeMonthRegex, MonthRegex ]
DateUnitRegex: !simpleRegex
def: (?<unit>(año|(?<uoy>semana))(?<plural>s)?|(?<uoy>mes)(?<plural>es)?|(?<uoy>d[ií]a)(?<plural>s)?(?<business>\s+(h[aá]biles|laborales))?)\b
def: (?<unit>(a[nñ]o|(?<uoy>semana))(?<plural>s)?|(?<uoy>mes)(?<plural>es)?|(?<uoy>d[ií]a)(?<plural>s)?(?<business>\s+(h[aá]biles|laborales))?)\b
PastRegex: !simpleRegex
def: (?<past>\b(pasad(a|o)(s)?|[uú]ltim[oa](s)?|anterior(es)?|previo(s)?)\b)
FutureRegex: !simpleRegex
Expand Down Expand Up @@ -162,7 +162,7 @@ MonthOfRegex: !nestedRegex
def: (mes)(\s+)({OfPrepositionRegex})
references: [ OfPrepositionRegex ]
RangeUnitRegex: !simpleRegex
def: \b(?<unit>años?|mes(es)?|semanas?)\b
def: \b(?<unit>a[nñ]os?|mes(es)?|semanas?)\b
BeforeAfterRegex: !simpleRegex
def: ^[.]
InConnectorRegex: !simpleRegex
Expand Down Expand Up @@ -433,7 +433,7 @@ PeriodSpecificTimeOfDayRegex: !nestedRegex
def: \b(({LaterEarlyRegex}\s+)?est[ae]\s+{DateTimeTimeOfDayRegex}|({StrictRelativeRegex}\s+{PeriodTimeOfDayRegex})|anoche)\b
references: [ PeriodTimeOfDayRegex, StrictRelativeRegex, DateTimeTimeOfDayRegex, LaterEarlyRegex ]
UnitRegex: !simpleRegex
def: (?<unit>años?|(bi|tri|cuatri|se)mestre|mes(es)?|semanas?|fin(es)?\s+de\s+semana|finde|d[ií]as?|horas?|hra?s?|hs?|minutos?|mins?|segundos?|segs?|noches?)\b
def: (?<unit>a[nñ]os?|(bi|tri|cuatri|se)mestre|mes(es)?|semanas?|fin(es)?\s+de\s+semana|finde|d[ií]as?|horas?|hra?s?|hs?|minutos?|mins?|segundos?|segs?|noches?)\b
ConnectorRegex: !simpleRegex
def: ^(,|t|(para|y|a|en|por) las?|(\s*,\s*)?((cerca|alrededor)\s+)?(de\s+las?|del))$
# SpanishTimePeriodExtractorConfiguration
Expand Down Expand Up @@ -561,7 +561,7 @@ OrRegex: !simpleRegex
# TODO: modify the regex below to match its counterpart in English
def: ^[.]
SpecialYearTermsRegex: !nestedRegex
def: \b(({SpecialYearPrefixes}\s+años?\s+|años?\s+({SpecialYearPrefixes}\s+)?)(de\s+)?)
def: \b(({SpecialYearPrefixes}\s+a[nñ]os?\s+|a[nñ]os?\s+({SpecialYearPrefixes}\s+)?)(de\s+)?)
references: [ SpecialYearPrefixes ]
YearPlusNumberRegex: !nestedRegex
def: \b({SpecialYearTermsRegex}((?<year>(\d{2,4}))|{FullTextYearRegex}))\b
Expand All @@ -580,7 +580,7 @@ CenturyRegex: !simpleRegex
DecadeRegex: !simpleRegex
def: (?<decade>diez|veinte|treinta|cuarenta|cincuenta|se[st]enta|ochenta|noventa)
DecadeWithCenturyRegex: !nestedRegex
def: (los\s+)?((((d[ée]cada(\s+de)?)\s+)(((?<century>\d|1\d|2\d)?(?<decade>\d0))))|a[ñn]os\s+((((dos\s+)?mil\s+)?({WrittenOneHundredToNineHundredRegex}\s+)?{DecadeRegex})|((dos\s+)?mil\s+)?({WrittenOneHundredToNineHundredRegex})(\s+{DecadeRegex}?)|((dos\s+)?mil)(\s+{WrittenOneHundredToNineHundredRegex}\s+)?{DecadeRegex}?))
def: (los\s+)?((((d[ée]cada(\s+de)?)\s+)(((?<century>\d|1\d|2\d)?(?<decade>\d0))))|a[ñn]os\s+((?<century>\d|1\d|2\d)?(?<decade>\d0)\b)|a[ñn]os\s+(((?<century>((dos\s+)?mil\s+)?({WrittenOneHundredToNineHundredRegex}\s+)?)?{DecadeRegex})|(?<century>((dos\s+)?mil\s+)?({WrittenOneHundredToNineHundredRegex}))(\s+{DecadeRegex}?)|(?<century>((dos\s+)?mil)(\s+{WrittenOneHundredToNineHundredRegex}\s+)?){DecadeRegex}?))
references: [ WrittenOneHundredToNineHundredRegex, DecadeRegex ]
RelativeDecadeRegex: !nestedRegex
def: \b(((el|las?)\s+)?{RelativeRegex}\s+((?<number>[\w,]+)\s+)?(d[eé]cada|decenio)s?)\b
Expand Down Expand Up @@ -609,6 +609,8 @@ UnitMap: !dictionary
entries:
años: Y
año: Y
anos: Y
ano: Y
meses: MON
mes: MON
trimestre: 3MON
Expand Down Expand Up @@ -651,6 +653,8 @@ UnitValueMap: !dictionary
entries:
años: 31536000
año: 31536000
anos: 31536000
ano: 31536000
meses: 2592000
mes: 2592000
semanas: 604800
Expand Down Expand Up @@ -1048,9 +1052,16 @@ ModSuffixRegex: !nestedRegex
references: [AgoRegex, LaterRegex, BeforeAfterRegex, FutureSuffixRegex, PastSuffixRegex]
WrittenDecades: !dictionary
types: [ string, int ]
# TODO: modify below dictionary according to the counterpart in English
entries:
"": 0
'diez': 10
'veinte': 20
'treinta': 30
'cuarenta': 40
'cincuenta': 50
'sesenta': 60
'setenta': 70
'ochenta': 80
'noventa': 90
SpecialDecadeCases: !dictionary
types: [ string, int ]
# TODO: modify the dictionary below if there are special cases for written decades
Expand Down Expand Up @@ -1167,6 +1178,8 @@ YearTerms: !list
entries:
- año
- años
- ano
- anos
YearToDateTerms: !list
types: [ string ]
entries:
Expand Down Expand Up @@ -1198,7 +1211,7 @@ QuarterTypeRegex: !simpleRegex
SemiAnnualTypeRegex: !simpleRegex
def: (semestral(es|mente)?)$
YearTypeRegex: !simpleRegex
def: (años?|anual(mente)?)$
def: (a[nñ]os?|anual(mente)?)$
ThisTerms: !list
types: [ string ]
entries:
Expand Down
6 changes: 4 additions & 2 deletions Patterns/Spanish/Spanish-NumbersWithUnit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
AgeSuffixList: !dictionary
types: [ string, string ]
entries:
Año: años|año
Año: años|anos|año|ano
Mes: meses|mes
Semana: semanas|semana
Día: dias|días|día|dia
Expand All @@ -12,7 +12,9 @@ AmbiguousAgeUnitList: !list
types: [ string ]
entries:
- años
- anos
- año
- ano
- meses
- mes
- semanas
Expand Down Expand Up @@ -969,7 +971,7 @@ LengthSuffixList: !dictionary
Yarda: yd|yarda|yardas
Pulgada: pulgada|pulgadas|"
Pie: pie|pies|ft
Año luz: año luz|años luz|al
Año luz: año luz|ano luz|años luz|anos luz|al
AmbiguousLengthUnitList: !list
types: [ string ]
entries:
Expand Down
44 changes: 35 additions & 9 deletions Specs/DateTime/Spanish/DatePeriodExtractor.json
Original file line number Diff line number Diff line change
Expand Up @@ -3482,10 +3482,10 @@
"NotSupportedByDesign": "javascript, python",
"Results": [
{
"Text": "años 1970",
"Text": "los años 1970",
"Type": "daterange",
"Start": 7,
"Length": 9
"Start": 3,
"Length": 13
}
]
},
Expand All @@ -3495,10 +3495,10 @@
"NotSupportedByDesign": "javascript, python",
"Results": [
{
"Text": "años 2000",
"Text": "los años 2000",
"Type": "daterange",
"Start": 13,
"Length": 9
"Start": 9,
"Length": 13
}
]
},
Expand All @@ -3521,10 +3521,36 @@
"NotSupportedByDesign": "javascript, python",
"Results": [
{
"Text": "años 70",
"Text": "los años 70",
"Type": "daterange",
"Start": 7,
"Length": 7
"Start": 3,
"Length": 11
}
]
},
{
"Input": "escrito en los anos 90",
"NotSupported": "java",
"NotSupportedByDesign": "javascript, python",
"Results": [
{
"Text": "los anos 90",
"Type": "daterange",
"Start": 11,
"Length": 11
}
]
},
{
"Input": "escritos en los ultimos 3 anos",
"NotSupported": "java",
"NotSupportedByDesign": "javascript, python",
"Results": [
{
"Text": "ultimos 3 anos",
"Type": "daterange",
"Start": 16,
"Length": 14
}
]
},
Expand Down
Loading

0 comments on commit 7534fab

Please sign in to comment.