From 169d04c9743c5deac5309bef496029f172dd0afe Mon Sep 17 00:00:00 2001 From: Deyan Ginev Date: Fri, 29 Dec 2023 11:23:42 -0800 Subject: [PATCH] respect \@sanitize guard for \index (#2249) * respect \@sanitize guard for \index via new SanitizedVerbatim parameter type * stricter balancing check in Tokens::isBalanced * more correct emulation of parameter tokens for \index * tweak comment --- lib/LaTeXML/Core/Tokens.pm | 6 ++++- lib/LaTeXML/Package/LaTeX.pool.ltxml | 34 +++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/lib/LaTeXML/Core/Tokens.pm b/lib/LaTeXML/Core/Tokens.pm index 053309fa1..5fa66ef0c 100644 --- a/lib/LaTeXML/Core/Tokens.pm +++ b/lib/LaTeXML/Core/Tokens.pm @@ -94,7 +94,11 @@ sub isBalanced { foreach my $t (@$self) { my $cc = $$t[1]; # INLINE $level++ if $cc == CC_BEGIN; - $level-- if $cc == CC_END; } + if ($cc == CC_END) { + $level--; + # Note that '{ }} {' is still unbalanced + # even though the left and right braces match in count. + last if $level < 0; } } return $level == 0; } # NOTE: Assumes each arg either undef or also Tokens diff --git a/lib/LaTeXML/Package/LaTeX.pool.ltxml b/lib/LaTeXML/Package/LaTeX.pool.ltxml index 4b167b6c6..a6e6e08b0 100644 --- a/lib/LaTeXML/Package/LaTeX.pool.ltxml +++ b/lib/LaTeXML/Package/LaTeX.pool.ltxml @@ -4422,9 +4422,14 @@ our %index_style = (textbf => 'bold', bf => 'bold', textrm => '', rm => '', sub process_index_phrases { my ($gullet, $phrases, $inlist) = @_; my @expansion = (); + my @tokens = $phrases->unlist; + # check we have a well-formed argument + return unless @tokens; + if (!$phrases->isBalanced) { # if ill-formed, discard; + Warn("malformed", "indexentry", $gullet, + 'index entry has unbalanced groups, discarding: "' . ToString($phrases) . '"'); + return; } # Split the text into phrases, separated by "!" - my @tokens = $phrases->unlist; - return unless @tokens; push(@tokens, T_OTHER('!')) unless $tokens[-1]->getString eq '!'; # Add terminal ! my @phrase = (); my @sortas = (); @@ -4462,7 +4467,30 @@ sub process_index_phrases { T_BEGIN, @expansion, T_END); return @expansion; } -DefMacro('\index{}', \&process_index_phrases); +# read verbatim, as if with LaTeX's \@sanitize; +# useful for \index (maybe others?) +DefParameterType('SanitizedVerbatim', sub { + my ($gullet) = @_; + $gullet->readUntil(T_BEGIN); + # crucial: deactivate the backslash to avoid activating command sequences + # chars switched to CC_OTHER by \@sanitize: ' ', '\\', '$', '&', '#', '^', '_', '%', '~' + # some of those are already in state's "SPECIALS", so only adding the rest: + StartSemiverbatim(' ', '\\', '%'); + my $arg = $gullet->readBalanced(); + EndSemiverbatim(); + # now that we have the semiverbatim tokens, retokenize. + # this may seem like wasted work, but it avoids very unfortunate error propagation in cases + # where the \index argument was malformed for one reason or another. + # + # the strangeness comes from the original TeX workflow requiring multiple conversion calls, + # alongside a call to the `makeidx` binary, which we don't do in latexml. This parameter type + # emulates one important aspect implied by those steps. + $arg = TokenizeInternal(UnTeX($arg)); + return $arg; }, + reversion => sub { (T_BEGIN, Revert($_[0]), T_END); }); + +# real-world LaTeX \index +DefMacro('\index SanitizedVerbatim', \&process_index_phrases); Tag('ltx:indexphrase', afterClose => \&addIndexPhraseKey); Tag('ltx:glossaryphrase', afterClose => \&addIndexPhraseKey);