-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathreadme_expex.tex
909 lines (804 loc) · 34.1 KB
/
readme_expex.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\documentclass[
]{article}
\usepackage{xcolor}
\usepackage{amsmath,amssymb}
\setcounter{secnumdepth}{5}
% Engine-dependent font/encoding setup.
% iftex provides \ifPDFTeX, used below to branch on the TeX engine:
%   - pdfTeX branch: T1 font encoding, UTF-8 input, textcomp symbols;
%   - otherwise (LuaTeX/XeTeX): unicode-math plus default font features.
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math} % this also loads fontspec
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
% Latin Modern is loaded for every engine.
\usepackage{lmodern}
% Intentionally empty branch: placeholder where engine-specific font
% selection would go for xetex/luatex (nothing is selected here).
\ifPDFTeX\else
% xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph spacing: no indentation, vertical space between paragraphs.
% Three cases, chosen at compile time:
%   1. non-KOMA class and parskip.sty is installed -> load parskip;
%   2. non-KOMA class without parskip.sty -> set \parindent/\parskip by hand;
%   3. KOMA class (\KOMAClassName is defined) -> use the class's own
%      parskip=half option.
% \makeatletter is required because \@ifundefined contains an @.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{graphicx}
% \pandocbounded{<content>}
%   Typesets <content> (in this file: an \includegraphics call) at its
%   natural size unless it is taller than \textheight or wider than
%   \linewidth; in that case it is scaled down uniformly so that it fits.
%   Content that already fits is never enlarged.
\makeatletter
% Scratch box used to measure the content before deciding on a scale.
\newsavebox\pandoc@box
\newcommand*\pandocbounded[1]{% scales image to fit in text height/width
% Typeset the argument once into the box so its dimensions can be read.
\sbox\pandoc@box{#1}%
% \Gscale@div is a graphicx internal; presumably it stores the ratio of its
% two length arguments in the given macro (verify against graphicx source).
% \@tempa: \textheight divided by the box's total height (height + depth).
\Gscale@div\@tempa{\textheight}{\dimexpr\ht\pandoc@box+\dp\pandoc@box\relax}%
% \@tempb: \linewidth divided by the box's width.
\Gscale@div\@tempb{\linewidth}{\wd\pandoc@box}%
\ifdim\@tempb\p@<\@tempa\p@\let\@tempa\@tempb\fi% select the smaller of both
% Scale only when the chosen factor is below 1 (content too large);
% otherwise output the box unchanged.
\ifdim\@tempa\p@<\p@\scalebox{\@tempa}{\usebox\pandoc@box}%
\else\usebox{\pandoc@box}%
\fi%
}
% Set default figure placement to htbp
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
% \tightlist: used at the start of list environments in the body to remove
% the vertical space between items (\itemsep and \parskip set to 0pt).
% \providecommand defines it only if it is not already defined.
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
% expex provides the example macros used throughout the body
% (\ex, \pex, \a, \xe, \begingl ... \endgl, \judge, \ljudge, \getref).
\usepackage{expex}
% Global expex spacing parameters. The negative and zero values tighten the
% vertical space around the parts of an example relative to the defaults.
% NOTE(review): the exact meaning of each key is defined by expex; confirm
% against the expex manual before changing the values.
\lingset{
belowglpreambleskip = -1.5ex,
aboveglftskip = -1.5ex,
exskip = 0ex,
interpartskip = -0.5ex,
belowpreambleskip = -2ex
}
% PDF bookmarks and hyperlink configuration.
% NOTE(review): hyperref is not loaded explicitly in this preamble; it is
% presumably pulled in by bookmark (options for it are passed at the top of
% the file) -- confirm.
\usepackage{bookmark}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
% Typeset URLs in the surrounding text font instead of a separate URL font.
\urlstyle{same}
% PDF metadata (title, author, creator); hidelinks turns off link decoration.
\hypersetup{
pdftitle={Using pandoc-ling},
pdfauthor={Michael Cysouw},
hidelinks,
pdfcreator={LaTeX via pandoc}}
\title{Using pandoc-ling}
\author{Michael Cysouw}
\date{}
\begin{document}
\maketitle
{
\setcounter{tocdepth}{3}
\tableofcontents
}
\section{pandoc-ling}\label{pandoc-ling}
\emph{Michael Cysouw}
\textless{}\href{mailto:[email protected]}{\nolinkurl{[email protected]}}\textgreater{}
A Pandoc filter for linguistic examples
tl;dr
\begin{itemize}
\tightlist
\item
Easily write linguistic examples including basic interlinear glossing.
\item
Let numbering and cross-referencing be done for you.
\item
Export to (almost) any format of your wishes for final polishing.
\item
As an example, check out this readme in
\href{https://cysouw.github.io/pandoc-ling/readme.html}{HTML} or
\href{https://cysouw.github.io/pandoc-ling/readme_gb4e.pdf}{Latex}.
\end{itemize}
\section{Rationale}\label{rationale}
In the field of linguistics there is an outspoken tradition to format
example sentences in research papers in a very specific way. In the
field, it is a perennial problem to get such example sentences to look
just right. Within Latex, there are numerous packages to deal with this
problem (e.g.~covington, linguex, gb4e, expex, etc.). Depending on your
needs, there is some Latex solution for almost everyone. However, these
solutions in Latex are often cumbersome to type, and they are not
portable to other formats. Specifically, transfer between latex, html,
docx, odt or epub would actually be highly desirable. Such transfer is
the hallmark of \href{https://pandoc.org}{Pandoc}, a tool by John
MacFarlane that provides conversion between these (and many more)
formats.
Any such conversion between text-formats naturally never works
perfectly: every text-format has specific features that are not
transferable to other formats. A central goal of Pandoc (at least in my
interpretation) is to define a set of shared concepts for text-structure
(a `common denominator' if you will, but surely not `least'!) that can
then be mapped to other formats. In many ways, Pandoc tries (again) to
define a set of logical concepts for text structure (`semantic markup'),
which can then be formatted by your favourite typesetter. As long as you
stay inside the realm of this `common denominator' (in practice that
means Pandoc's extended version of Markdown/CommonMark), conversion
works reasonably well (think 90\%-plus).
Building on John Gruber's
\href{https://daringfireball.net/projects/markdown/syntax}{Markdown
philosophy}, there is a strong urge here to learn to restrain oneself
while writing, and try to restrict the number of layout-possibilities to
a minimum. In this sense, with \texttt{pandoc-ling} I propose a
Markdown-structure for linguistic examples that is simple, easy to type,
easy to read, and portable through the Pandoc universe by way of an
extension mechanism of Pandoc, called a `Pandoc Lua Filter'. This
extension will not magically allow you to write every linguistic example
thinkable, but my guess is that in practice the present proposal covers
the majority of situations in linguistic publications (think 90\%-plus).
As an example (and test case) I have included automatic conversions into
various formats in this repository (check them out in the directory
\texttt{tests} to get an idea of the strengths and weaknesses of the
current implementation).
\section{The basic structure of a linguistic
example}\label{the-basic-structure-of-a-linguistic-example}
Basically, a linguistic example consists of 6 possible building blocks,
of which only the number and at least one example line are necessary.
The space between the building blocks is kept as minimal as possible
without becoming cramped. When (optional) building blocks are not
included, then the other blocks shift left and up (only exception: a
preamble without labels is not shifted left completely, but left-aligned
with the example, not with the judgement).
\begin{itemize}
\tightlist
\item
\textbf{Number}: Running tally of all examples in the work, possibly
restarting at chapters or other major headings. Typically between
round brackets, possibly with a chapter number added before in long
works, e.g.~example (7.26). Aligned top-left, typically left-aligned
to main text margin.
\item
\textbf{Preamble}: Optional information about the content/kind of
example. Aligned top-left: to the top with the number, to the left
with the (optional) label. When there is no label, then preamble is
aligned with the example, not with the judgment.
\item
\textbf{Label}: Indices for sub-examples. Only present when there is
more than one example grouped together inside one numbered entity.
Typically these sub-example labels use latin letters followed by a
full stop. They are left-aligned with the preamble, and each label is
top-aligned with the top-line of the corresponding example (important
for longer line-wrapped examples).
\item
\textbf{Judgment}: Examples can optionally have grammaticality
judgments, typically symbols like **?!* sometimes in superscript
relative to the corresponding example. Judgements are right-aligned to
each other, typically with only minimal space to the left-aligned
examples.
\item
\textbf{Line example}: A minimal linguistic example has at least one
line example, i.e.~an utterance of interest. Building blocks in
general shift left and up when other (optional) building blocks are
not present. Minimally, this results in a number with one line
example.
\item
\textbf{Interlinear example}: A complex structure typically used for
examples from languages unknown to most readers. Consists of three or
four lines that are left-aligned:
\begin{itemize}
\tightlist
\item
\textbf{Header}: An optional header is typically used to display
information about the language of the example, including literature
references. When not present, then all other lines from the
interlinear example shift upwards.
\item
\textbf{Source}: The actual language utterance, often typeset in
italics. This line is internally separated at spaces, and each
sub-block is left-aligned with the corresponding sub-blocks of the
gloss.
\item
\textbf{Gloss}: Explanation of the meaning of the source, often
using abbreviations in small caps. This line is internally separated
at spaces, and each block is left-aligned with the block from
source.
\item
\textbf{Translation}: Free translation of the source, typically
quoted. Not separated in blocks, but freely extending to the right.
Left-aligned with the other lines from the interlinear example.
\end{itemize}
\end{itemize}
\begin{figure}
\centering
\pandocbounded{\includegraphics[keepaspectratio]{figure/ExampleStructure.png}}
\caption{The structure of a linguistic example.}
\end{figure}
There are of course many more possibilities to extend the structure of a
linguistic example, like third or fourth subdivisions of labels (often
using small roman numerals as a third level) or multiple glossing lines
in the interlinear example. Also, the content of the header is sometimes
found right-aligned to the right of the interlinear example (language
info at the top, reference at the bottom). All such options are
currently not supported by \texttt{pandoc-ling}.
Under the hood, this structure is prepared by \texttt{pandoc-ling} as a
table. Tables are reasonably well transcoded to different document
formats. Specific layout considerations mostly have to be set manually.
Alignment of the text should work in most exports. Some \texttt{CSS}
styling is proposed by \texttt{pandoc-ling}, but can of course be
overruled. For latex (and beamer) special output is prepared using
various available latex packages (see options, below).
\section{\texorpdfstring{Introducing
\texttt{pandoc-ling}}{Introducing pandoc-ling}}\label{introducing-pandoc-ling}
\subsection{Editing linguistic
examples}\label{editing-linguistic-examples}
To include a linguistic example in Markdown \texttt{pandoc-ling} uses
the \texttt{div} structure, which is indicated in Pandoc-Markdown by
typing three colons at the start and three colons at the end. To
indicate the \texttt{class} of this \texttt{div} the letters `ex' (for
`example') should be added after the top colons (with or without space
in between). This `ex'-class is the signal for \texttt{pandoc-ling} to
start processing such a \texttt{div}. The numbering of these examples
will be inserted by \texttt{pandoc-ling}.
Empty lines can be added inside the \texttt{div} for visual pleasure, as
they mostly do not have an influence on the output. Exception: do
\emph{not} use empty lines between unlabelled line examples. Multiple
lines of text can be used (without empty lines in between), but they
will simply be interpreted as one sequential paragraph.
\begin{verbatim}
::: ex
This is the most basic structure of a linguistic example.
:::
\end{verbatim}
\begin{samepage}
\ex<ex1>
This is the most basic structure of a linguistic example.
\xe
\end{samepage}
Alternatively, the \texttt{class} can be put in curled brackets (and
then a leading full stop is necessary before \texttt{ex}). Inside these
brackets more attributes can be added (separated by space), for example
an id, using a hash, or any attribute=value pairs that should apply to
this example. Currently there is only one real attribute implemented
(\texttt{formatGloss}), but in principle it is possible to add more
attributes that can be used to fine-tune the typesetting of the example
(see below for a description of such \texttt{local\ options}).
\begin{verbatim}
::: {#id .ex formatGloss=false}
This is a multi-line example.
But that does not mean anything for the result
All these lines are simply treated as one paragraph.
They will become one example with one number.
:::
\end{verbatim}
\begin{samepage}
\ex<id>
This is a multi-line example. But that does not mean anything for the
result All these lines are simply treated as one paragraph. They will
become one example with one number.
\xe
\end{samepage}
A preamble can be added by inserting an empty line between preamble and
example. The same considerations about multiple text-lines apply.
\begin{verbatim}
:::ex
Preamble
This is an example with a preamble.
:::
\end{verbatim}
\begin{samepage}
\ex<ex3> Preamble\\
This is an example with a preamble.
\xe
\end{samepage}
Sub-examples with labels are entered by starting each sub-example with a
small latin letter and a full stop. Empty lines between labels are
allowed. Subsequent lines without labels are treated as one paragraph.
Empty lines \emph{not} followed by a label with a full stop will result
in errors.
\begin{verbatim}
:::ex
a. This is the first example.
b. This is the second.
a. The actual letters are not important, `pandoc-ling` will put them in order.
e. Empty lines are allowed between labelled lines
Subsequent lines are again treated as one sequential paragraph.
:::
\end{verbatim}
\begin{samepage}
\pex[*=]<ex4>
\a This is the first example.
\a This is the second.
\a The actual letters are not important, \texttt{pandoc-ling} will put
them in order.
\a Empty lines are allowed between labelled lines Subsequent lines are
again treated as one sequential paragraph.
\xe
\end{samepage}
A labelled list can be combined with a preamble.
\begin{verbatim}
:::ex
Any nice description here
a. one example sentence.
b. two
c. three
:::
\end{verbatim}
\begin{samepage}
\pex[*=]<ex5> Any nice description here\\
\a one example sentence.
\a two
\a three
\xe
\end{samepage}
Grammaticality judgements should be added before an example, and after
an optional label, separated from both by spaces (though four spaces in
a row should be avoided, that could lead to layout errors). To indicate
that any sequence of symbols is a judgement, prepend the judgement with
a caret \texttt{\^{}}. Alignment will be figured out by
\texttt{pandoc-ling}.
\begin{verbatim}
:::ex
Throwing in a preamble for good measure
a. ^* This traditionally signals ungrammaticality.
b. ^? Question-marks indicate questionable grammaticality.
c. ^^whynot?^ But in principle any sequence can be used (here even in superscript).
d. However, such long sequences sometimes lead to undesirable effects in the layout.
:::
\end{verbatim}
\begin{samepage}
\pex[*=whynot?]<ex6> Throwing in a preamble for good measure\\
\a \ljudge{*}This traditionally signals ungrammaticality.
\a \ljudge{?}Question-marks indicate questionable grammaticality.
\a \ljudge{\textsuperscript{whynot?}}But in principle any sequence can
be used (here even in superscript).
\a However, such long sequences sometimes lead to undesirable effects
in the layout.
\xe
\end{samepage}
A minor detail is the alignment of a single example with a preamble and
grammaticality judgements. In this case it looks better for the preamble
to be left aligned with the example and not with the judgement.
\begin{verbatim}
:::ex
Here is a special case with a preamble
^^???^ With a singly questionably example.
Note the alignment! Especially with this very long example
that should go over various lines in the output.
:::
\end{verbatim}
\begin{samepage}
\ex<ex7> Here is a special case with a preamble\\
\judge{\textsuperscript{???}} With a singly questionably example. Note
the alignment! Especially with this very long example that should go
over various lines in the output.
\xe
\end{samepage}
For the lazy writers among us, it is also possible to use a simple
bullet list instead of a labelled list. Note that the listed elements
will still be formatted as a labelled list.
\begin{verbatim}
:::ex
- This is a lazy example.
- ^# It should return letters at the start just as before.
- ^% Also testing some unusual judgements.
:::
\end{verbatim}
\begin{samepage}
\pex[*=\#]<ex8>
\a This is a lazy example.
\a \ljudge{\#}It should return letters at the start just as before.
\a \ljudge{\%}Also testing some unusual judgements.
\xe
\end{samepage}
Just for testing: a single example with a judgement (which resulted in
an error in earlier versions).
\begin{verbatim}
::: ex
^* This traditionally signals ungrammaticality.
:::
\end{verbatim}
\begin{samepage}
\ex<ex9>
\judge{*} This traditionally signals ungrammaticality.
\xe
\end{samepage}
\subsection{Interlinear examples}\label{interlinear-examples}
For interlinear examples with aligned source and gloss, the structure of
a \texttt{lineblock} is used, starting the lines with a vertical line
\texttt{\textbar{}}. There should always be four vertical lines (for
header, source, gloss and translation, respectively), although the
content after the first vertical line can be empty. The source and gloss
lines are separated at spaces, and all parts are left-aligned. If you
want to have a space that is not separated, you will have to `protect'
the space, either by putting a backslash before the space, or by
inserting a non-breaking space instead of a normal space (either type
\texttt{\&nbsp;} or insert an actual non-breaking space, i.e.~unicode
character \texttt{U+00A0}).
\begin{verbatim}
:::ex
| Dutch (Germanic)
| Deze zin is in het nederlands.
| DEM sentence AUX in DET dutch.
| This sentence is dutch.
:::
\end{verbatim}
\begin{samepage}
\ex[*=]<ex10>
\begingl
\glpreamble Dutch (Germanic)//
\gla Deze zin is in het nederlands. //
\glb DEM sentence AUX in DET dutch. //
\glft This sentence is dutch.//
\endgl
\xe
\end{samepage}
An attempt is made to format interlinear examples when the option
\texttt{formatGloss=true} is added. This will:
\begin{itemize}
\tightlist
\item
remove formatting from the source and set everything in italics,
\item
remove formatting from the gloss and set sequences (\textgreater1) of
capitals and numbers into small caps (note that the positioning of
small caps on web pages is
\href{https://iamvdo.me/en/blog/css-font-metrics-line-height-and-vertical-align}{highly
complex}),
\item
a tilde \texttt{\textasciitilde{}} between spaces in the gloss is
treated as a shortcut for an empty gloss (internally, the sequence
\texttt{space-tilde-space} is replaced by
\texttt{space-space-nonBreakingSpace-space-space}),
\item
consistently put translations in single quotes, possibly removing
other quotes.
\end{itemize}
\begin{verbatim}
::: {.ex formatGloss=true}
| Dutch (Germanic)
| Is deze zin in het nederlands ?
| AUX DEM sentence in DET dutch Q
| Is this sentence dutch?
:::
\end{verbatim}
\begin{samepage}
\ex[*=]<ex11>
\begingl
\glpreamble Dutch (Germanic)//
\gla \emph{Is} \emph{deze} \emph{zin} \emph{in} \emph{het}
\emph{nederlands} \emph{?} //
\glb \textsc{aux} \textsc{dem} sentence in \textsc{det} dutch
\textsc{q} //
\glft `Is this sentence dutch?'//
\endgl
\xe
\end{samepage}
The results of such formatting will not always work, but it seems to be
quite robust in my testing. The next example brings everything together:
\begin{itemize}
\tightlist
\item
a preamble,
\item
labels, both for single lines and for interlinear examples,
\item
interlinear examples start on a new line immediately after the
letter-label,
\item
grammaticality judgements with proper alignment,
\item
when the header of an interlinear example is left out, everything is
shifted up,
\item
The formatting of the interlinear is harmonised.
\end{itemize}
\begin{verbatim}
::: {.ex formatGloss=true samePage=false}
Completely superfluous preamble, but it works ...
a. Mixing single line examples with interlinear examples.
a. This is of course highly unusal.
Just for this example, let's add some extra material in this example.
a.
| Dutch (Germanic) Note the grammaticality judgement!
| ^^:–)^ Deze zin is (dit\ is test) nederlands.
| DEM sentence AUX ~ dutch.
| This sentence is dutch.
b.
|
| Deze tweede zin heeft geen header.
| DEM second sentence have.3SG.PRES no header.
| This second sentence does not have a header.
:::
\end{verbatim}
\pex[*=:–)]<ex12> Completely superfluous preamble, but it works
\ldots{}\\
\a Mixing single line examples with interlinear examples.
\a This is of course highly unusal. Just for this example, let's add
some extra material in this example.
\a
\begingl
\glpreamble Dutch (Germanic) Note the grammaticality judgement!//
\gla \ljudge{\textsuperscript{:--)}}\emph{Deze} \emph{zin} \emph{is}
\emph{(dit~is~test)} \emph{nederlands.} //
\glb \textsc{dem} sentence \textsc{aux} ~ dutch. //
\glft `This sentence is dutch.'//
\endgl
\a
\begingl
\gla \emph{Deze} \emph{tweede} \emph{zin} \emph{heeft} \emph{geen}
\emph{header.} //
\glb \textsc{dem} second sentence have.\textsc{3sg}.\textsc{pres} no
header. //
\glft `This second sentence does not have a header.'//
\endgl
\xe
Just for testing: a single interlinear example without header (which
resulted in a latex-linguex error in earlier versions).
\begin{verbatim}
::: {.ex formatGloss=true}
|
| Dit is een lui voorbeeld
| DEM COP DET lazy example
| This is a lazy example.
:::
\end{verbatim}
\begin{samepage}
\ex[*=]<ex13>
\begingl
\gla \emph{Dit} \emph{is} \emph{een} \emph{lui} \emph{voorbeeld} //
\glb \textsc{dem} \textsc{cop} \textsc{det} lazy example //
\glft `This is a lazy example.'//
\endgl
\xe
\end{samepage}
\subsection{Cross-referencing
examples}\label{cross-referencing-examples}
The examples are automatically numbered by \texttt{pandoc-ling}.
Cross-references to examples inside a document can be made by using the
\texttt{{[}@ID{]}} format (used by Pandoc for citations). When an
example has an explicit identifier (like \texttt{\#test} in the next
example), then a reference can be made to this example with
\texttt{{[}@test{]}}, leading to (\getref{test}) when formatted (note
that the formatting does not work on the github website. Please check
the `docs' subdirectory).
\begin{verbatim}
::: {#test .ex}
This is a test
:::
\end{verbatim}
\begin{samepage}
\ex<test>
This is a test
\xe
\end{samepage}
Inspired by the \texttt{linguex}-approach, you can also use the keywords
\texttt{next} or \texttt{last} to refer to the next or the last example,
e.g.~\texttt{{[}@last{]}} will be formatted as (\getref{test}). By
doubling the first letters to \texttt{nnext} or \texttt{llast} reference
to the next/last-but-one can be made. Actually, the number of starting
letters can be repeated at will in \texttt{pandoc-ling}, so something
like \texttt{{[}@llllllllast{]}} will also work. It will be formatted as
(\getref{ex7}) after the processing of \texttt{pandoc-ling}. Needless to
say that in such a situation an explicit identifier would be a better
choice.
Referring to sub-examples can be done by manually adding a suffix into
the cross reference, simply separated from the identifier by a space.
For example, \texttt{{[}@lllast~c{]}} will refer to the third
sub-example of the last-but-two example. Formatted this will look like
this: (\getref{ex13}\,c), smile! However, note that the ``c'' has to be
manually determined. It is simply a literal suffix that will be copied
into the cross-reference. Something like \texttt{{[}@last\ hA1l0{]}}
will work also, leading to (\getref{test}\,hA1l0) when formatted (which
is of course nonsensical).
For exports that include attributes (like html), the examples have an
explicit id of the form \texttt{exNUMBER} in which \texttt{NUMBER} is
the actual number as given in the formatted output. This means that it
is possible to refer to an example on any web-page by using the
hash-mechanism to refer to a part of the web-page. For example
\texttt{\#ex4.7} can be used to refer to the seventh example in the
html-output of this readme (try
\href{https://cysouw.github.io/pandoc-ling/readme.html\#ex4.7}{this
link}). The id in this example has a chapter number `4' because in the
html conversion I have set the option \texttt{addChapterNumber} to
\texttt{true}. (Note: when numbers restart the count in each chapter
with the option \texttt{restartAtChapter}, then the id is of the form
\texttt{exCHAPTER.NUMBER}. This is necessary to resolve clashing ids, as
the same number might then be used in different chapters.)
I propose to use these ids also to refer to examples in citations when
writing scholarly papers, e.g.~(Cysouw 2021: \#ex7), independent of
whether the links actually resolve. In principle, such citations could
easily be resolved when online publications are properly prepared. The
same proposal could also work for other parts of research papers, for
example using tags like \texttt{\#sec,\ \#fig,\ \#tab,\ \#eq} (see the
Pandoc filter
\href{https://github.com/cysouw/crossref-adapt}{\texttt{crossref-adapt}}).
To refer to paragraphs (which should replace page numbers in a future of
adaptive design), I propose to use no tag, but directly add the number
to the hash (see the Pandoc filter
\href{https://github.com/cysouw/count-para}{\texttt{count-para}} for a
practical mechanism to add such numbering).
\subsection{\texorpdfstring{Options of
\texttt{pandoc-ling}}{Options of pandoc-ling}}\label{options-of-pandoc-ling}
\subsubsection{Global options}\label{global-options}
The following global options are available with \texttt{pandoc-ling}.
These can be added to the
\href{https://pandoc.org/MANUAL.html\#metadata-blocks}{Pandoc metadata}.
An example of such metadata can be found at the bottom of this
\texttt{readme} in the form of a YAML-block. Pandoc allows for various
methods to provide metadata (see the link above).
\begin{itemize}
\tightlist
\item
\textbf{\texttt{formatGloss}} (boolean, default \texttt{false}):
should all interlinear examples be consistently formatted? If you use
this option, you can simply use capital letters for abbreviations in
the gloss, and they will be changed to small caps. The source line is
set to italics, and the translation is put into single quotes.
\item
\textbf{\texttt{samePage}} (boolean, default \texttt{true}, only for
Latex): should examples be kept together on the same page? Can also be
overridden for individual examples by adding
\texttt{\{.ex\ samePage=false\}} at the start of an example (cf.~below
on \texttt{local\ options}).
\item
\textbf{\texttt{xrefSuffixSep}} (string, defaults to no-break-space):
When cross references have a suffix, how should the separator be
formatted? The default `no-break-space' is a safe option. I
personally like a `narrow no-break space' better (Unicode
\texttt{U+202F}), but this symbol does not work with all fonts, and
might thus lead to errors. For Latex typesetting, all space-like
symbols are converted to a Latex thin space
\texttt{\textbackslash{},}.
\item
\textbf{\texttt{restartAtChapter}} (boolean, default \texttt{false}):
should the counting restart for each chapter?
\begin{itemize}
\tightlist
\item
Actually, when \texttt{true} this setting will restart the counting
at the highest heading level, which for various output formats can
be set by the Pandoc option \texttt{top-level-division}.
\item
The id of each example will now be of the form
\texttt{exCHAPTER.NUMBER} to resolve any clashes when the same
number appears in different chapters.
\item
Depending on your Latex setup, an explicit entry
\texttt{top-level-division:\ chapter} might be necessary in your
metadata.
\end{itemize}
\item
\textbf{\texttt{addChapterNumber}} (boolean, default \texttt{false}):
should the chapter (= highest heading level) number be added to the
number of the example? When setting this to \texttt{true} any setting
of \texttt{restartAtChapter} will be ignored. In most Latex situations
this only works in combination with a \texttt{documentclass:\ book}.
\item
\textbf{\texttt{latexPackage}} (one of: \texttt{linguex},
\texttt{gb4e}, \texttt{langsci-gb4e}, \texttt{expex}, default
\texttt{linguex}): Various options for converting examples to Latex
packages that typeset linguistic examples. None of the conversions
works perfectly, though it should work in most normal situations
(think 90\%-plus). It might be necessary to first convert to
\texttt{Latex}, correct the output, and then typeset separately with a
latex compiler like \texttt{xelatex}. Using the direct option inside
Pandoc might also work in many situations. Export to
\textbf{\texttt{beamer}} seems to work reasonably well with the
\texttt{gb4e} package. All others have artefacts or errors.
\end{itemize}
\subsubsection{Local options}\label{local-options}
Local options are options that can be set for each individual example.
The \texttt{formatGloss} option can be used to have an individual
example be formatted differently from the global setting. For example,
when the global setting is \texttt{formatGloss:\ true} in the metadata,
then adding \texttt{formatGloss=false} in the curly brackets of a
specific example will block the formatting. This is especially useful
when the automatic formatting does not give the desired result.
If you want to add something else (not a linguistic example) in a
numbered example, then there is the local option \texttt{noFormat=true}.
An attempt will be made to try and do a reasonable layout. Multiple
paragraphs will simply be taken as is, and the number will be put in
front. In HTML the number will be centred. It is usable for an
incidental mathematical formula.
\begin{verbatim}
::: {.ex noFormat=true}
$$\sum_{i=1}^{n}{i}=\frac{n^2-n}{2}$$
:::
\end{verbatim}
\begin{samepage}
\ex<ex15>
\[\sum_{i=1}^{n}{i}=\frac{n^2-n}{2}\]\\
\xe
\end{samepage}
\subsection{\texorpdfstring{Issues with
\texttt{pandoc-ling}}{Issues with pandoc-ling}}\label{issues-with-pandoc-ling}
\begin{itemize}
\tightlist
\item
Manually provided identifiers for examples should not be purely
numerical (so do not use e.g.~\texttt{\#5789}). In some situations this
interferes with the setting of the cross-references.
\item
Because the cross-references use the same structure as citations in
Pandoc, the processing of citations (by \texttt{citeproc}) should be
performed \textbf{after} the processing by \texttt{pandoc-ling}.
Another Pandoc filter,
\href{https://github.com/lierdakil/pandoc-crossref}{\texttt{pandoc-crossref}},
for numbering figures and other captions, also uses the same system.
There seems to be no conflict between \texttt{pandoc-ling} and
\texttt{pandoc-crossref}.
\item
Interlinear examples will not wrap at the end of the page. There
is no solution yet for examples that are longer than the size
of the page.
\item
It is not (yet) possible to have more than one glossing line.
\item
When exporting to \texttt{docx} there is a problem because there are
paragraphs inserted after tables, which adds space in lists with
multiple interlinear examples (except when they have exactly the same
number of columns). This is
\href{https://answers.microsoft.com/en-us/msoffice/forum/msoffice_word-mso_windows8-mso_2013_release/how-to-remove-extra-paragraph-after-table/995b3811-9f55-4df1-bbbc-9f672b1ad262}{by
design}. The official solution is to set font-size to 1 for this
paragraph inside MS Word.
\item
Multi-column cells are crucial for \texttt{pandoc-ling} to work
properly. These are only introduced in the new table format with Pandoc
2.10 (so older Pandoc versions are not supported). Also note that these
structures are not yet exported to all formats, e.g.~they will not be
displayed correctly in \texttt{docx}. However, this is currently an
area of active development.
\item
\texttt{langsci-gb4e} is only available as part of the
\href{https://ctan.org/pkg/langsci?lang=en}{\texttt{langsci} package}.
You have to make it available to Pandoc, e.g.~by adding it into the
same directory as the pandoc-ling.lua filter. I have added a recent
version of \texttt{langsci-gb4e} here for convenience, but this one
might be outdated at some time in the future.
\item
\texttt{beamer} output seems to work best with
\texttt{latexPackage:\ gb4e}.
\end{itemize}
\subsection{A note on Latex
conversion}\label{a-note-on-latex-conversion}
Originally, I decided to write this filter as a two-pronged conversion,
making a markdown version myself, but using a mapping to one of the many
latex libraries for linguistics examples as a quick fix. I assumed that
such a mapping would be the easy part. However, it turned out that the
mapping to latex was much more difficult than I anticipated. Basically,
it turned out that the `common denominator' that I was aiming for was
not necessarily the `common denominator' provided by the latex packages.
I worked on mapping to various packages (linguex, gb4e, langsci-gb4e and
expex) with growing dismay. This approach resulted in a first version.
However, after this version was (more or less) finished, I realised that
it would be better to first define the `common denominator' more clearly
(as done here), and then implement this purely in Pandoc. From that
basis I have then made attempts to map them to the various latex
packages.
\subsection{A note on implementation}\label{a-note-on-implementation}
The basic structure of the examples is transformed into Pandoc tables.
Tables are reasonably safe for converting to other formats. Care has
been taken to add \texttt{classes} to all elements of the tables
(e.g.~the preamble has the class \texttt{linguistic-example-preamble}).
When exported formats are aware of these classes, they can be used to
fine-tune the formatting. I have applied a few such fine-tunings to the
html output of this filter by adding a few CSS-style statements. The
naming of the classes is quite transparent, using the form
\texttt{linguistic-example-STRUCTURE}.
The whole table is encapsulated in a \texttt{div} with class \texttt{ex}
and an id of the form \texttt{exNUMBER}. This means that an example can
be directly referred to in web-links by using the hash-mechanism. For
example, adding \texttt{\#ex3} to the end of a link will immediately
jump to this example in a browser.
The current implementation is completely independent from the
\href{https://pandoc.org/MANUAL.html\#numbered-example-lists}{Pandoc
numbered examples implementation} and both can work side by side, like
(2):
\begin{enumerate}
\def\labelenumi{(\arabic{enumi})}
\item
These are native Pandoc numbered examples
\item
They are independent of \texttt{pandoc-ling} but use the same output
formatting in many default exports, like latex.
\end{enumerate}
However, various output-formats of Pandoc (e.g.~latex) also
use numbers in round brackets for these, so in practice it might be
confusing to combine both.
\end{document}