Skip to content

Commit

Permalink
Getting there
Browse files Browse the repository at this point in the history
  • Loading branch information
johnml1135 committed Jan 16, 2025
1 parent d1cc368 commit 4e1eec1
Show file tree
Hide file tree
Showing 9 changed files with 86 additions and 122 deletions.
4 changes: 2 additions & 2 deletions src/SIL.Machine/Corpora/IUsfmParserHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ IReadOnlyList<UsfmAttribute> attributes
/// <summary>
/// Start of note text
/// </summary>
void StartNoteText(UsfmParserState state, string marker);
void StartNoteText(UsfmParserState state);

/// <summary>
/// End of note text
/// </summary>
void EndNoteText(UsfmParserState state, string marker);
void EndNoteText(UsfmParserState state);

/// <summary>
/// Start of a table
Expand Down
41 changes: 11 additions & 30 deletions src/SIL.Machine/Corpora/ScriptureRefUsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ public enum ScriptureTextType
None,
NonVerse,
Verse,
Note
Note,
NoteText
}

public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
Expand All @@ -18,7 +19,6 @@ public abstract class ScriptureRefUsfmParserHandlerBase : UsfmParserHandlerBase
private readonly Stack<ScriptureElement> _curElements;
private readonly Stack<ScriptureTextType> _curTextType;
private bool _duplicateVerse = false;
private bool _inNoteText = false;

protected ScriptureRefUsfmParserHandlerBase()
{
Expand Down Expand Up @@ -162,10 +162,16 @@ public override void StartNote(UsfmParserState state, string marker, string call
}
}

public override void EndNote(UsfmParserState state, string marker, bool closed)
public override void StartNoteText(UsfmParserState state)
{
if (_inNoteText && !_duplicateVerse)
EndNoteText(state);
_curTextType.Push(ScriptureTextType.NoteText);
StartNoteText(state, CreateNonVerseRef());
}

/// <summary>
/// Handles the end of note text reported by the parser: forwards to the reference-aware
/// <c>EndNoteText</c> overload, then removes the top entry from the text-type stack.
/// </summary>
/// <param name="state">The parser state passed through to the reference-aware overload.</param>
public override void EndNoteText(UsfmParserState state)
{
    // Build the reference first so it is created while the text-type stack is still unchanged.
    var noteTextRef = CreateNonVerseRef();
    EndNoteText(state, noteTextRef);

    // Drop the top text type (presumably the NoteText entry pushed when the note text started).
    _curTextType.Pop();
}

public override void Text(UsfmParserState state, string text)
Expand All @@ -192,17 +198,6 @@ IReadOnlyList<UsfmAttribute> attributes
CheckConvertVerseParaToNonVerse(state);
}

public override void EndChar(
UsfmParserState state,
string marker,
IReadOnlyList<UsfmAttribute> attributes,
bool closed
)
{
if (_inNoteText && !_duplicateVerse && UsfmStylesheet.IsNoteOrCrossReferencePart(marker))
EndNoteText(state);
}

protected virtual void StartVerseText(UsfmParserState state, IReadOnlyList<ScriptureRef> scriptureRefs) { }

protected virtual void EndVerseText(UsfmParserState state, IReadOnlyList<ScriptureRef> scriptureRefs) { }
Expand Down Expand Up @@ -242,20 +237,6 @@ private void EndNonVerseText(UsfmParserState state)
_curTextType.Pop();
}

public void StartNoteText(UsfmParserState state)
{
_curTextType.Push(ScriptureTextType.Note);
StartNoteText(state, CreateNonVerseRef());
_inNoteText = true;
}

private void EndNoteText(UsfmParserState state)
{
EndNoteText(state, CreateNonVerseRef());
_curTextType.Pop();
_inNoteText = false;
}

private void UpdateVerseRef(VerseRef verseRef, string marker)
{
if (!VerseRef.AreOverlappingVersesRanges(verseRef, _curVerseRef))
Expand Down
63 changes: 26 additions & 37 deletions src/SIL.Machine/Corpora/UpdateUsfmParserHandler.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

namespace SIL.Machine.Corpora
{
Expand Down Expand Up @@ -32,7 +31,6 @@ public class UpdateUsfmParserHandler : ScriptureRefUsfmParserHandlerBase
private readonly UpdateUsfmIntraVerseMarkerBehavior _noteBehavior;
private readonly UpdateUsfmIntraVerseMarkerBehavior _formattingBehavior;
private readonly Stack<bool> _replace;
private readonly Regex _nonAlpha = new Regex("[^a-zA-Z0-9]");
private int _rowIndex;
private int _tokenIndex;

Expand Down Expand Up @@ -190,7 +188,7 @@ bool closed
)
{
// strip out char-style markers in verses that are being replaced
if (ReplaceWithNewTokens(state, closed: closed, endCharacter: true))
if (ReplaceWithNewTokens(state, closed: closed))
SkipTokens(state);
else
CollectTokens(state);
Expand Down Expand Up @@ -362,57 +360,48 @@ private void SkipTokens(UsfmParserState state)
_tokenIndex = state.Index + 1 + state.SpecialTokenCount;
}

private bool ReplaceWithNewTokens(
UsfmParserState state,
bool closed = true,
bool endCharacter = false,
bool endNote = false
)
private bool ReplaceWithNewTokens(UsfmParserState state, bool closed = true, bool endNote = false)
{
bool stripExistingText = _textBehavior == UpdateUsfmTextBehavior.StripExisting;
bool newText = _replace.Count > 0 && _replace.Peek();
bool inNote = state.NoteTag != null || endNote;
bool inNoteText = CurrentTextType == ScriptureTextType.NoteText;
bool isNoteTag =
state.Token.Marker != null && UsfmStylesheet.IsNoteOrCrossReferencePart(state.Token.Marker);
bool isFormattingTag =
state.Token.Marker != null && !UsfmStylesheet.IsNoteOrCrossReferencePart(state.Token.Marker);

bool existingText = state
.Tokens.Skip(_tokenIndex)
.Take(state.Index + 1 + state.SpecialTokenCount - _tokenIndex)
.Any(t => t.Type == UsfmTokenType.Text && t.Text.Length > 0);

int tokenEnd = state.Index + state.SpecialTokenCount;
bool existingText = false;
for (int index = _tokenIndex; index <= tokenEnd; index++)
{
if (state.Tokens[index].Type == UsfmTokenType.Text && state.Tokens[index].Text.Length > 0)
{
existingText = true;
break;
}
}
bool useNewTokens =
stripExistingText
|| (newText && !existingText)
|| (newText && _textBehavior == UpdateUsfmTextBehavior.PreferNew && !state.IsReferenceText);
|| (newText && _textBehavior == UpdateUsfmTextBehavior.PreferNew && (!inNote || inNoteText));

if (useNewTokens && _newTokens.Count > 0)
_tokens.AddRange(_newTokens);

_newTokens.Clear();

bool skipTokens = useNewTokens && closed;

// figure out when to skip the existing text
bool withinNewText = _replace.Any(r => r);

if (withinNewText)
if (withinNewText && inNote)
{
string bareMarker = _nonAlpha.Replace(state.Token.Marker ?? "", "");
if (state.Token.Type == UsfmTokenType.Character || endCharacter)
{
var behavior = UsfmStylesheet.IsNoteOrCrossReferencePart(bareMarker)
? _noteBehavior
: _formattingBehavior;
skipTokens = stripExistingText || behavior == UpdateUsfmIntraVerseMarkerBehavior.Strip;
}
if (_noteBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip)
return true;

if (state.NoteTag != null || endNote)
{
skipTokens = stripExistingText || _noteBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip;
}
if (!inNoteText || isNoteTag)
return false;
}

bool skipTokens = useNewTokens && closed;

if (newText && isFormattingTag)
{
skipTokens = _formattingBehavior == UpdateUsfmIntraVerseMarkerBehavior.Strip;
}
return skipTokens;
}

Expand Down
24 changes: 19 additions & 5 deletions src/SIL.Machine/Corpora/UsfmParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -486,10 +486,15 @@ public bool ProcessToken()
Handler?.Ref(State, token.Marker, display, target);
break;
}
if (IsNoteTextStart(token) && State.NoteTag != null)

if (UsfmStylesheet.IsNoteOrCrossReferencePart(token.Marker))
CloseNoteText();

if (IsNoteText(token))
{
Handler?.StartNoteText(State, token.Marker);
break;
// Note text should be handled as a full segment
State.Push(new UsfmParserElement(UsfmElementType.NoteText, token.Marker));
Handler?.StartNoteText(State);
}

string actualMarker;
Expand Down Expand Up @@ -623,6 +628,12 @@ private void CloseNote(bool closed = false)
}
}

/// <summary>
/// Closes elements from the top of the parser stack for as long as the top element is of type
/// <see cref="UsfmElementType.NoteText"/>; stops at the first element of any other type or
/// when the stack is empty.
/// </summary>
private void CloseNoteText()
{
    while (State.Stack.Count > 0)
    {
        if (State.Peek().Type != UsfmElementType.NoteText)
            break;

        CloseElement();
    }
}

private void CloseCharStyles()
{
while (State.Stack.Count > 0 && State.Peek().Type == UsfmElementType.Char)
Expand All @@ -646,6 +657,9 @@ private void CloseElement(bool closed = false)
case UsfmElementType.Note:
Handler?.EndNote(State, element.Marker, closed);
break;
case UsfmElementType.NoteText:
Handler?.EndNoteText(State);
break;
case UsfmElementType.Table:
Handler?.EndTable(State);
break;
Expand Down Expand Up @@ -678,9 +692,9 @@ private bool IsRef(UsfmToken token)
&& (token.Marker == "ref");
}

private bool IsNoteTextStart(UsfmToken token)
private bool IsNoteText(UsfmToken token)
{
return token.Marker == "ft";
return token.Marker == "ft" && State.Stack.Any(elem => elem.Type == UsfmElementType.Note);
}
}
}
4 changes: 2 additions & 2 deletions src/SIL.Machine/Corpora/UsfmParserHandlerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ public virtual void StartNote(UsfmParserState state, string marker, string calle

public virtual void EndNote(UsfmParserState state, string marker, bool closed) { }

public virtual void StartNoteText(UsfmParserState state, string marker) { }
public virtual void StartNoteText(UsfmParserState state) { }

public virtual void EndNoteText(UsfmParserState state, string marker) { }
public virtual void EndNoteText(UsfmParserState state) { }

public virtual void StartTable(UsfmParserState state) { }

Expand Down
5 changes: 1 addition & 4 deletions src/SIL.Machine/Corpora/UsfmParserState.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,6 @@ public UsfmParserState(UsfmStylesheet stylesheet, ScrVers versification, IReadOn
/// </summary>
public int SpecialTokenCount { get; internal set; }

public bool IsReferenceText =>
!(CharTag is null)
&& (UsfmStylesheet.IsReference(CharTag.Marker) || UsfmStylesheet.IsFigure(CharTag.Marker));

/// <summary>
/// Current paragraph tag or null for none.
/// Note that book and table rows are considered paragraphs for legacy checking reasons.
Expand Down Expand Up @@ -210,6 +206,7 @@ public enum UsfmElementType
Row,
Cell,
Note,
NoteText,
Sidebar
};

Expand Down
33 changes: 8 additions & 25 deletions src/SIL.Machine/Corpora/UsfmStylesheet.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,6 @@ namespace SIL.Machine.Corpora
public class UsfmStylesheet
{
private static readonly Regex CellRangeRegex = new Regex(@"^(t[ch][cr]?[1-5])-([2-5])$", RegexOptions.Compiled);
private static readonly HashSet<string> ReferenceTags = new HashSet<string>
{
"fl",
"fr",
"fv",
"r",
"rq",
"va",
"vp",
"xo",
"xop",
"xot",
"xnt",
"xdc",
"xt",
"zpa-xb",
"zpa-xc",
"zpa-xv"
};

private static readonly HashSet<string> NoteTextTags = new HashSet<string> { "ft", };

Expand Down Expand Up @@ -63,6 +44,8 @@ public class UsfmStylesheet
"zpa-xv"
};

private static readonly Regex NonAlpha = new Regex("[^a-zA-Z0-9]");

private static readonly Dictionary<string, UsfmJustification> JustificationMappings = new Dictionary<
string,
UsfmJustification
Expand Down Expand Up @@ -162,21 +145,21 @@ public static bool IsCellRange(string tag, out string baseMarker, out int colSpa
return false;
}

public static bool IsReference(string tag)
{
return !(tag is null) && ReferenceTags.Contains(tag);
}

public static bool IsNoteOrCrossReferencePart(string tag)
{
return !(tag is null) && NoteAndCrossReferencePartTags.Contains(tag);
return !(tag is null) && NoteAndCrossReferencePartTags.Contains(NonAlpha.Replace(tag ?? "", ""));
}

/// <summary>
/// Determines whether the given tag is the figure marker.
/// </summary>
/// <param name="tag">The marker to test; may be null.</param>
/// <returns><c>true</c> only when <paramref name="tag"/> is exactly <c>fig</c>.</returns>
public static bool IsFigure(string tag) => tag == "fig";

/// <summary>
/// Determines whether the given tag is one of the note text tags.
/// </summary>
/// <param name="tag">The marker to test; may be null.</param>
/// <returns>
/// <c>true</c> when <paramref name="tag"/> is non-null and contained in <c>NoteTextTags</c>;
/// otherwise <c>false</c>.
/// </returns>
public static bool IsNoteText(string tag)
{
    // A null tag can never be a member of the set, so answer without a lookup.
    if (tag is null)
        return false;

    return NoteTextTags.Contains(tag);
}

private static IEnumerable<string> GetEmbeddedStylesheet(string fileName)
{
using (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
\s1 Chapter \it Two \it*
\p
\p
\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a footnote.\f*one.
\v 1 Chapter \add two\add*, verse \f + \fr 2:1: \ft This is a\bd footnote.\bd*\f*one.
\v 2-3 Chapter two, // verse \fm ∆\fm*two.
\esb
\ms This is a sidebar
Expand Down
Loading

0 comments on commit 4e1eec1

Please sign in to comment.