Skip to content

Commit

Permalink
Support mixed length unit foot and inch (#3169)
Browse files Browse the repository at this point in the history
* SupportMixedUnitLengthNotRecognized - Implemented

* SupportMixedUnitLengthNotRecognized - Update test cases

* SupportMixedUnitLengthNotRecognized - Update one test case to trigger rebuild

---------

Co-authored-by: Michael Wang (Centific Technologies Inc) <[email protected]>
  • Loading branch information
MichaelMWW and Michael Wang (Centific Technologies Inc) authored Oct 10, 2024
1 parent cdb178a commit 87ed3ea
Show file tree
Hide file tree
Showing 8 changed files with 460 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -919,5 +919,13 @@ public static class NumbersWithUnitDefinitions
{ @"\b(deg(rees?)?|°)$", @"\b((deg(rees?)?|°)\s*(c(elsius|entigrate)?|f(ah?renheit)?)|(temperature)(\s+(\p{L}+|\d+)){0,4}\s*(deg(rees?)?\b|°))" },
{ @"\b\d+\s*\p{L}+$", @"((\d+\s*\p{L}+\d+)|(\p{L}\d+\s*\p{L}+))" }
};
/// <summary>
/// Ratio table keyed by length sub-unit name; "Inch" maps to 12.
/// NOTE(review): presumably the number of sub-units that make up one parent unit
/// (12 inches per foot) - confirm against the parser that consumes it.
/// </summary>
public static readonly Dictionary<string, long> LengthSubUnitFractionalRatios = new Dictionary<string, long>
{
    [@"Inch"] = 12,
};
/// <summary>
/// Maps a length unit name to the name of the sub-unit it may be combined with;
/// currently only "Foot" -> "Inch".
/// </summary>
public static readonly Dictionary<string, string> LengthUnitToSubUnitMap = new Dictionary<string, string>
{
    [@"Foot"] = @"Inch",
};
}
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using System.Collections.Generic;
using System.Collections.Immutable;
using System.Globalization;
using System.Linq;

using Microsoft.Recognizers.Definitions.English;

namespace Microsoft.Recognizers.Text.NumberWithUnit.English
Expand All @@ -31,6 +31,10 @@ public class DimensionExtractorConfiguration : EnglishNumberWithUnitExtractorCon
.Concat(AngleExtractorConfiguration.AngleSuffixList.ToDictionary(x => x.Key, x => Constants.ANGLE))
.ToImmutableDictionary(x => x.Key, x => x.Value);

/// <summary>
/// Unit-to-sub-unit map for compound lengths; exposes the same dictionary instance as
/// <see cref="NumbersWithUnitDefinitions.LengthUnitToSubUnitMap"/>.
/// </summary>
public static readonly IDictionary<string, string> LengthUnitToSubUnitMap = NumbersWithUnitDefinitions.LengthUnitToSubUnitMap;

/// <summary>
/// Sub-unit ratio table for compound lengths; exposes the same dictionary instance as
/// <see cref="NumbersWithUnitDefinitions.LengthSubUnitFractionalRatios"/>.
/// </summary>
public static readonly IDictionary<string, long> LengthSubUnitFractionalRatios = NumbersWithUnitDefinitions.LengthSubUnitFractionalRatios;

private static readonly ImmutableList<string> AmbiguousUnits =
NumbersWithUnitDefinitions.AmbiguousDimensionUnitList
.Concat(AreaExtractorConfiguration.AmbiguousUnits)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using System.Collections.Generic;
using System.Globalization;

namespace Microsoft.Recognizers.Text.NumberWithUnit.English
{
public class DimensionParserConfiguration : EnglishNumberWithUnitParserConfiguration
{
/// <summary>
/// Unit-to-sub-unit map for compound lengths; re-exports
/// <see cref="DimensionExtractorConfiguration.LengthUnitToSubUnitMap"/> so the parser reads the same dictionary as the extractor.
/// </summary>
public static readonly IDictionary<string, string> LengthUnitToSubUnitMap = DimensionExtractorConfiguration.LengthUnitToSubUnitMap;

/// <summary>
/// Sub-unit ratio table for compound lengths; re-exports
/// <see cref="DimensionExtractorConfiguration.LengthSubUnitFractionalRatios"/> so the parser reads the same dictionary as the extractor.
/// </summary>
public static readonly IDictionary<string, long> LengthSubUnitFractionalRatios = DimensionExtractorConfiguration.LengthSubUnitFractionalRatios;

public DimensionParserConfiguration()
: this(new CultureInfo(Culture.English))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,24 @@ public List<ExtractResult> Extract(string source)
else if (CheckExtractorType(Constants.SYS_UNIT_DIMENSION))
{
result = FilterAmbiguity(result, source, this.config.DimensionAmbiguityFiltersDict);

// Only merge dimensions whose unit is listed in LengthUnitToSubUnitMap; for now this covers only the foot-and-inch compound.
if (this.config as English.DimensionExtractorConfiguration != null
&& suffixMatches.Count > 0
&& result != null
&& result.Count >= 2)
{
var compoundUnit = English.DimensionExtractorConfiguration.DimensionSuffixList
.Where(kvp => English.DimensionExtractorConfiguration.LengthUnitToSubUnitMap.ContainsKey(kvp.Key))
.Where(kvp => kvp.Value.Split('|').Contains(suffixMatches[0].Text))
.Select(kvp => kvp)
.ToList();

if (compoundUnit.Any())
{
result = MergeCompoundUnits(result, source);
}
}
}

if (CheckExtractorType(Constants.SYS_UNIT_CURRENCY))
Expand Down Expand Up @@ -558,6 +576,191 @@ private List<ExtractResult> FilterAmbiguity(List<ExtractResult> extractResults,
return extractResults;
}

/// <summary>
/// Merge compound units when extracting, like compound 5 foot 3 inch as one entity.
/// </summary>
/// <remarks>
/// Works in three passes: (1) assign every extract result a group id, where neighbours that are
/// separated only by whitespace or by a compound-unit connector share an id; (2) fold each group
/// into a single result whose <c>Data</c> holds the list of its members; (3) unwrap groups that
/// ended up with a single member and drop leftover pure-number results.
/// </remarks>
/// <param name="ers">
/// Extract results. The list is modified in place: <c>MergePureNumber</c> may append pure numbers
/// that follow a unit and re-sorts the list by start position.
/// </param>
/// <param name="source">Input text.</param>
/// <returns>The compounded units.</returns>
private List<ExtractResult> MergeCompoundUnits(List<ExtractResult> ers, string source)
{
    var result = new List<ExtractResult>();

    MergePureNumber(source, ers);

    if (ers.Count == 0)
    {
        return result;
    }

    // groups[i] is the index in `result` of the merged entry that ers[i] belongs to.
    // It must grow by 0 or 1 at each step so that `result[group]` below stays valid.
    var groups = new int[ers.Count];
    groups[0] = 0;

    for (var idx = 0; idx < ers.Count - 1; idx++)
    {
        // Two results of different, non-number types never belong together.
        // Start a new group explicitly: leaving the slot at its default value 0 would
        // silently put the next result back into the first group.
        if (ers[idx].Type != ers[idx + 1].Type &&
            !ers[idx].Type.Equals(Constants.SYS_NUM, StringComparison.Ordinal) &&
            !ers[idx + 1].Type.Equals(Constants.SYS_NUM, StringComparison.Ordinal))
        {
            groups[idx + 1] = groups[idx] + 1;
            continue;
        }

        // The current result wraps another extract result whose Data does not start with the
        // integer prefix (presumably a non-integer number part): nothing may be appended to it.
        if (ers[idx].Data is ExtractResult er &&
            !er.Data.ToString().StartsWith(Number.Constants.INTEGER_PREFIX, StringComparison.Ordinal))
        {
            groups[idx + 1] = groups[idx] + 1;
            continue;
        }

        var middleBegin = ers[idx].Start + ers[idx].Length ?? 0;
        var middleEnd = ers[idx + 1].Start ?? 0;
        var length = middleEnd - middleBegin;

        // Overlapping results cannot be joined; keep them in separate groups.
        if (length < 0)
        {
            groups[idx + 1] = groups[idx] + 1;
            continue;
        }

        var middleStr = source.Substring(middleBegin, length).Trim();

        // Separated by whitespace
        if (string.IsNullOrEmpty(middleStr))
        {
            groups[idx + 1] = groups[idx];
            continue;
        }

        // Separated by connectors: the connector regex has to cover the whole gap.
        var match = config.CompoundUnitConnectorRegex.Match(middleStr);
        if (match.Success && match.Index == 0 && match.Length == middleStr.Length)
        {
            groups[idx + 1] = groups[idx];
        }
        else
        {
            groups[idx + 1] = groups[idx] + 1;
        }
    }

    for (var idx = 0; idx < ers.Count; idx++)
    {
        // First member of a group: seed the merged entry with a copy of itself.
        if (idx == 0 || groups[idx] != groups[idx - 1])
        {
            var tmpExtractResult = ers[idx].Clone();

            tmpExtractResult.Data = new List<ExtractResult>
            {
                new ExtractResult
                {
                    Data = ers[idx].Data,
                    Length = ers[idx].Length,
                    Start = ers[idx].Start,
                    Text = ers[idx].Text,
                    Type = ers[idx].Type,
                },
            };

            result.Add(tmpExtractResult);
        }

        // Reduce extract results in same group
        if (idx + 1 < ers.Count && groups[idx + 1] == groups[idx])
        {
            var group = groups[idx];

            var periodBegin = result[group].Start ?? 0;
            var periodEnd = (ers[idx + 1].Start ?? 0) + (ers[idx + 1].Length ?? 0);

            result[group].Length = periodEnd - periodBegin;
            result[group].Text = source.Substring(periodBegin, periodEnd - periodBegin);

            // This merge only runs for the dimension extractor (foot + inch), so the merged
            // entity is a dimension, not a currency.
            result[group].Type = Constants.SYS_UNIT_DIMENSION;
            (result[group].Data as List<ExtractResult>)?.Add(ers[idx + 1]);
        }
    }

    // A group with a single member is not a compound: hand back the plain member instead.
    for (var idx = 0; idx < result.Count; idx++)
    {
        var innerData = result[idx].Data as List<ExtractResult>;
        if (innerData?.Count == 1)
        {
            result[idx] = innerData[0];
        }
    }

    // Pure numbers that were not absorbed into a compound are not unit results.
    result.RemoveAll(o => o.Type == Constants.SYS_NUM);

    return result;
}

/// <summary>
/// Adds to <paramref name="ers"/> the pure numbers that directly follow an already extracted
/// unit (separated only by whitespace or by a compound-unit connector), then re-sorts
/// <paramref name="ers"/> by start position.
/// </summary>
/// <param name="source">Input text.</param>
/// <param name="ers">Extract results; modified in place.</param>
private void MergePureNumber(string source, List<ExtractResult> ers)
{
    var trailingNumbers = new List<ExtractResult>();

    // Index of the first unit result that does not end strictly before the current number.
    var cursor = 0;

    foreach (var number in config.UnitNumExtractor.Extract(source))
    {
        var cursorBefore = cursor;
        while (cursor < ers.Count && ers[cursor].Start + ers[cursor].Length < number.Start)
        {
            cursor++;
        }

        // Only numbers for which the cursor moved past at least one unit result are candidates.
        if (cursor == cursorBefore)
        {
            continue;
        }

        // Filter cases like "1 dollars 11a", "11" is not the fraction here.
        if (source.Length > number.Start + number.Length)
        {
            var following = source.Substring(number.Length + number.Start ?? 0, 1)[0];
            if (char.IsLetter(following) && !SimpleTokenizer.IsCjk(following))
            {
                continue;
            }
        }

        var previous = ers[cursor - 1];
        var gapStart = previous.Start + previous.Length ?? 0;
        var gapEnd = number.Start ?? 0;
        var gap = source.Substring(gapStart, gapEnd - gapStart).Trim();

        // Either nothing but whitespace lies in between, or the gap is entirely one connector.
        var joins = string.IsNullOrEmpty(gap);
        if (!joins)
        {
            var connector = config.CompoundUnitConnectorRegex.Match(gap);
            joins = connector.Success && connector.Index == 0 && connector.Length == gap.Length;
        }

        if (joins)
        {
            trailingNumbers.Add(number);
        }
    }

    foreach (var candidate in trailingNumbers)
    {
        // Skip a number whose start falls inside (or right at the end of) an existing result,
        // including numbers appended earlier in this loop.
        var covered = ers.Any(existing =>
            existing.Start <= candidate.Start && existing.Start + existing.Length >= candidate.Start);

        if (!covered)
        {
            ers.Add(candidate);
        }
    }

    ers.Sort((x, y) => x.Start - y.Start ?? 0);
}

private bool CheckExtractorType(string extractorType)
{
return this.config.ExtractType.Equals(extractorType, StringComparison.Ordinal);
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 87ed3ea

Please sign in to comment.