Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add parameter for filtering key terms by book/chapters #256

Merged
merged 1 commit into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ public class ParatextBackupTermsCorpus : DictionaryTextCorpus
public ParatextBackupTermsCorpus(
string fileName,
IEnumerable<string> termCategories,
bool useTermGlosses = true
bool useTermGlosses = true,
IDictionary<string, HashSet<int>> chapters = null
)
{
using (var archive = ZipFile.OpenRead(fileName))
Expand All @@ -18,7 +19,7 @@ public ParatextBackupTermsCorpus(
IEnumerable<(string, IReadOnlyList<string>)> glosses = new ZipParatextProjectTermsParser(
archive,
settings
).Parse(termCategories, useTermGlosses);
).Parse(termCategories, useTermGlosses, chapters);
string textId =
$"{settings.BiblicalTermsListType}:{settings.BiblicalTermsProjectName}:{settings.BiblicalTermsFileName}";

Expand Down
46 changes: 45 additions & 1 deletion src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Text.RegularExpressions;
using System.Xml.Linq;
using SIL.Extensions;
using SIL.Scripture;

namespace SIL.Machine.Corpora
{
Expand Down Expand Up @@ -49,11 +50,13 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti

public IEnumerable<(string TermId, IReadOnlyList<string> Glosses)> Parse(
IEnumerable<string> termCategories,
bool useTermGlosses = true
bool useTermGlosses = true,
IDictionary<string, HashSet<int>> chapters = null
)
{
XDocument biblicalTermsDoc;
IDictionary<string, string> termIdToCategoryDictionary;
IDictionary<string, ImmutableHashSet<VerseRef>> termIdToReferences;
if (_settings.BiblicalTermsListType == "Project")
{
if (Exists(_settings.BiblicalTermsFileName))
Expand All @@ -62,6 +65,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
termIdToReferences = GetReferences(biblicalTermsDoc);
}
}
else
Expand All @@ -74,6 +78,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
termIdToReferences = GetReferences(biblicalTermsDoc);
}
}
}
Expand All @@ -87,11 +92,13 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
{
biblicalTermsDoc = XDocument.Load(keyTermsFile);
termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc);
termIdToReferences = GetReferences(biblicalTermsDoc);
}
}
else
{
termIdToCategoryDictionary = new Dictionary<string, string>();
termIdToReferences = new Dictionary<string, ImmutableHashSet<VerseRef>>();
}

XDocument termsGlossesDoc = null;
Expand Down Expand Up @@ -124,6 +131,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
.Where(n => n.Name.LocalName == "TermRendering")
.Select(ele => (ele.Attribute("Id").Value, ele))
.Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategoryDictionary))
.Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences))
.Select(kvp =>
{
string id = kvp.Item1.Replace("\n", "&#xA");
Expand All @@ -144,6 +152,7 @@ protected ParatextProjectTermsParserBase(ParatextProjectSettingsParserBase setti
.Where(n => n.Name.LocalName == "Localization")
.Select(ele => (ele.Attribute("Id").Value, ele))
.Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategoryDictionary))
.Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences))
.Select(kvp =>
{
string id = kvp.Item1.Replace("\n", "&#xA");
Expand Down Expand Up @@ -175,6 +184,24 @@ IDictionary<string, string> termIdToCategoryDictionary
|| (termIdToCategoryDictionary.TryGetValue(id, out category) && termCategories.Contains(category));
}

/// <summary>
/// Decides whether the term with the given id passes the book/chapter filter.
/// </summary>
/// <param name="id">Id of the term being tested.</param>
/// <param name="chapters">
/// Requested chapters keyed by book id. <c>null</c> disables the filter. An empty chapter set for a
/// book matches every chapter of that book.
/// </param>
/// <param name="termIdToReferences">
/// Verse references per term id. When this map is empty no filtering is possible and every term passes.
/// </param>
/// <returns>
/// <c>true</c> when filtering is disabled or when at least one of the term's references falls in a
/// requested book/chapter; otherwise <c>false</c>.
/// </returns>
private static bool IsInChapters(
    string id,
    IDictionary<string, HashSet<int>> chapters,
    IDictionary<string, ImmutableHashSet<VerseRef>> termIdToReferences
)
{
    // Nothing to filter with, or no filter requested: keep the term.
    if (termIdToReferences.Count == 0 || chapters == null)
    {
        return true;
    }

    // A term that is unknown, or whose reference set is missing (null), cannot match any chapter.
    // The null check prevents Enumerable.Any from throwing ArgumentNullException.
    if (!termIdToReferences.TryGetValue(id, out ImmutableHashSet<VerseRef> verseRefs) || verseRefs == null)
    {
        return false;
    }

    return verseRefs.Any(vr =>
        chapters.TryGetValue(vr.Book, out HashSet<int> bookChapters)
        && (bookChapters.Count == 0 || bookChapters.Contains(vr.ChapterNum))
    );
}

public static IReadOnlyList<string> GetGlosses(string gloss)
{
//If entire term rendering is surrounded in square brackets, remove them
Expand Down Expand Up @@ -243,6 +270,23 @@ private static IDictionary<string, string> GetCategoryPerId(XDocument biblicalTe
.ToDictionary(e => e.Attribute("Id").Value, e => e.Element("Category")?.Value ?? "");
}

/// <summary>
/// Builds a map from term id to the set of verse references listed under that term's
/// <c>References</c> element.
/// </summary>
/// <param name="biblicalTermsDocument">The biblical terms XML document to scan for <c>Term</c> elements.</param>
/// <returns>
/// One entry per distinct term id. Terms without a <c>References</c> element, or whose references
/// cannot be parsed, map to an empty set (never <c>null</c>).
/// </returns>
private static IDictionary<string, ImmutableHashSet<VerseRef>> GetReferences(XDocument biblicalTermsDocument)
{
    // Length of the leading numeric portion of a reference value that is handed to VerseRef.
    // NOTE(review): presumably a BBBCCCVVV-encoded verse number - confirm against the terms file format.
    const int VerseNumberLength = 9;

    ImmutableHashSet<VerseRef> ParseVerseRefs(XElement referencesElement)
    {
        if (referencesElement == null)
        {
            return ImmutableHashSet<VerseRef>.Empty;
        }

        ImmutableHashSet<VerseRef>.Builder verseRefs = ImmutableHashSet.CreateBuilder<VerseRef>();
        foreach (XElement reference in referencesElement.Descendants())
        {
            string value = reference.Value;
            // Skip values too short to hold the numeric prefix instead of letting Substring throw,
            // and parse only once.
            if (
                value.Length >= VerseNumberLength
                && int.TryParse(value.Substring(0, VerseNumberLength), out int verseNumber)
            )
            {
                verseRefs.Add(new VerseRef(verseNumber));
            }
        }
        return verseRefs.ToImmutable();
    }

    return biblicalTermsDocument
        .Descendants()
        .Where(n => n.Name.LocalName == "Term")
        .DistinctBy(e => e.Attribute("Id").Value)
        .ToDictionary(e => e.Attribute("Id").Value, e => ParseVerseRefs(e.Element("References")));
}

protected abstract Stream Open(string fileName);

protected abstract bool Exists(string fileName);
Expand Down
32 changes: 29 additions & 3 deletions tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings_DoNotUseTermG
}

[Test]
public void TestGetKeyTermsFromTermsLocalizations_()
public void TestGetKeyTermsFromTermsLocalizations()
{
var env = new TestEnvironment(
new DefaultParatextProjectSettings(
Expand All @@ -88,6 +88,29 @@ public void TestGetKeyTermsFromTermsLocalizations_()
Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Aaron"));
}

[Test]
public void TestGetKeyTermsFromTermsLocalizations_FilterByChapters()
{
    // Arrange: a "Major" biblical terms list with French localizations, filtered to HAB chapter 1.
    var projectSettings = new DefaultParatextProjectSettings(
        biblicalTermsListType: "Major",
        biblicalTermsFileName: "BiblicalTerms.xml",
        languageCode: "fr"
    );
    var chapterFilter = new Dictionary<string, HashSet<int>> { ["HAB"] = new HashSet<int> { 1 } };
    var env = new TestEnvironment(projectSettings, useTermGlosses: true, chapters: chapterFilter);

    // Act
    IEnumerable<(string TermId, IReadOnlyList<string> Glosses)> terms = env.GetGlosses();

    // Assert: Habakkuk, YHWH, Kashdi/Chaldean are the only PN terms in HAB 1
    Assert.That(terms.Count(), Is.EqualTo(3));
    string firstTermGlosses = string.Join(" ", terms.First().Glosses);
    Assert.That(firstTermGlosses, Is.EqualTo("Habaquq"));
}

[Test]
public void TestGetKeyTermsFromTermsLocalizations_TermRenderingsExists_PreferLocalization()
{
Expand Down Expand Up @@ -150,16 +173,19 @@ public void TestGetGlosses(string glossString, IReadOnlyList<string> expectedOut
private class TestEnvironment(
ParatextProjectSettings? settings = null,
Dictionary<string, string>? files = null,
bool useTermGlosses = true
bool useTermGlosses = true,
IDictionary<string, HashSet<int>>? chapters = null
)
{
private readonly bool _useTermGlosses = useTermGlosses;
private readonly IDictionary<string, HashSet<int>>? _chapters = chapters;

public ParatextProjectTermsParserBase Parser { get; } =
new MemoryParatextProjectTermsParser(settings ?? new DefaultParatextProjectSettings(), files ?? new());

public IEnumerable<(string TermId, IReadOnlyList<string> Glosses)> GetGlosses()
{
return Parser.Parse(new string[] { "PN" }, _useTermGlosses);
return Parser.Parse(new string[] { "PN" }, _useTermGlosses, _chapters);
}
}

Expand Down
Loading