Skip to content

Commit

Permalink
respect \@sanitize guard for \index (#2249)
Browse files Browse the repository at this point in the history
* respect \@sanitize guard for \index via new SanitizedVerbatim parameter type

* stricter balancing check in Tokens::isBalanced

* more correct emulation of parameter tokens for \index

* tweak comment
  • Loading branch information
dginev authored Dec 29, 2023
1 parent 98b2102 commit 169d04c
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 4 deletions.
6 changes: 5 additions & 1 deletion lib/LaTeXML/Core/Tokens.pm
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,11 @@ sub isBalanced {
foreach my $t (@$self) {
my $cc = $$t[1]; # INLINE
$level++ if $cc == CC_BEGIN;
$level-- if $cc == CC_END; }
if ($cc == CC_END) {
$level--;
# Note that '{ }} {' is still unbalanced
# even though the left and right braces match in count.
last if $level < 0; } }
return $level == 0; }

# NOTE: Assumes each arg either undef or also Tokens
Expand Down
34 changes: 31 additions & 3 deletions lib/LaTeXML/Package/LaTeX.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -4422,9 +4422,14 @@ our %index_style = (textbf => 'bold', bf => 'bold', textrm => '', rm => '',
sub process_index_phrases {
my ($gullet, $phrases, $inlist) = @_;
my @expansion = ();
my @tokens = $phrases->unlist;
# check we have a well-formed argument
return unless @tokens;
if (!$phrases->isBalanced) { # if ill-formed, discard;
Warn("malformed", "indexentry", $gullet,
'index entry has unbalanced groups, discarding: "' . ToString($phrases) . '"');
return; }
# Split the text into phrases, separated by "!"
my @tokens = $phrases->unlist;
return unless @tokens;
push(@tokens, T_OTHER('!')) unless $tokens[-1]->getString eq '!'; # Add terminal !
my @phrase = ();
my @sortas = ();
Expand Down Expand Up @@ -4462,7 +4467,30 @@ sub process_index_phrases {
T_BEGIN, @expansion, T_END);
return @expansion; }

DefMacro('\index{}', \&process_index_phrases);
# Parameter type 'SanitizedVerbatim': reads a braced argument verbatim, in the
# manner of LaTeX's \@sanitize. Needed by \index; possibly useful elsewhere.
DefParameterType('SanitizedVerbatim', sub {
    my ($gullet) = @_;
    # Advance to the opening brace of the argument.
    $gullet->readUntil(T_BEGIN);
    # \@sanitize switches ' ', '\\', '$', '&', '#', '^', '_', '%', '~' to CC_OTHER.
    # Some of these are already in the state's "SPECIALS", so only the remaining
    # ones are passed here. Deactivating the backslash is the crucial part:
    # it keeps command sequences from being activated while reading.
    StartSemiverbatim(' ', '\\', '%');
    my $sanitized = $gullet->readBalanced();
    EndSemiverbatim();
    # Retokenize the semiverbatim tokens. This looks like redundant work, but it
    # avoids very unfortunate error propagation when the \index argument was
    # malformed for one reason or another.
    #
    # The oddity stems from the original TeX workflow, which needs several
    # conversion runs plus a call to the `makeidx` binary -- steps latexml does
    # not perform. This parameter type emulates one important aspect implied by
    # those steps.
    my $retokenized = TokenizeInternal(UnTeX($sanitized));
    return $retokenized; },
  reversion => sub {
    my ($arg) = @_;
    # Put the braces back around the reverted argument.
    return (T_BEGIN, Revert($arg), T_END); });

# \index as it behaves in real-world LaTeX: the argument is read using the
# SanitizedVerbatim parameter type, and the expansion is produced by
# process_index_phrases.
DefMacro('\index SanitizedVerbatim', \&process_index_phrases);

# Register addIndexPhraseKey as the afterClose handler for both the
# ltx:indexphrase and ltx:glossaryphrase tags.
Tag('ltx:indexphrase', afterClose => \&addIndexPhraseKey);
Tag('ltx:glossaryphrase', afterClose => \&addIndexPhraseKey);
Expand Down

0 comments on commit 169d04c

Please sign in to comment.