forked from rc0/jbofihe
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmorf_nfa.in
1044 lines (829 loc) · 29.9 KB
/
morf_nfa.in
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
###########################################################
# NFA definition file for matching Lojban morphology
#
# (process with nfa2dfa.pl to produce the DFA state/transition
# and accept tables for use with a suitable parsing fn.)
# This generates a scanner with the following functions
# - recognizes type of word
# - for gismu/lujvo/fu'ivla, provides enough info to work out where word
# starts, i.e. how many cmavo are prefixed. (Equivalently, works out whether
# the consonant cluster or the preceding single consonant is the start of
# the word.)
# - rigorously checks the word form for errors (bad clusters, y where not
# required, bad hyphenation after initial CVV rafsi, bad vowel pairing etc)
#
# $Header$
#
# COPYRIGHT
#
###########################################################
# Stuff to pass through verbatim to C output file (everything between the
# %{ and %} markers; keep NFA-file notes outside the markers).
%{
#include "morf_dfa.h"
%}
###########################################################
# Declare all symbols in same order as lexer (lexer returns (0 .. whatever)
# when it recognizes the corresponding token)
# There are 21 names in the list, so with the 0-based numbering described
# above UNK corresponds to 0 and YY to 20.  Changing the order here without
# changing the lexer to match would mis-label every token.
Tokens UNK V APOS Y R N C NR CI CSI CP CS CN H HS BT VV VX VO VY YY
# Token meanings are as follows
# UNK : Unknown character
# V : vowel [aeiou]
# APOS : '
# Y : y
# R : r following vowel
# N : n following vowel
# C : consonant other than r or n after vowel
# NR : r in the pair nr (or triple nyr)
# CI : 2nd letter of permissible initial consonant pair
# CSI : 2nd letter of permissible initial pair which is syllabic (l,m,n,r)
# CP : 2nd letter of permissible consonant pair (except nr)
# CS : 2nd letter of permissible pair which is syllabic (l,m,n,r)
# CN : 2nd letter of impermissible consonant pair
# H : rnC,CrC (hyphen occurring in stage III fu'ivla)
# HS : rln,nlr,Cnr (H with syllabic 3rd letter)
# BT : 3rd letter of one of the banned triples (ntc,nts,ndj,ndz)
# VV : 2nd vowel of ai|au|ei|oi (allowed in any word type), or 2nd vowel
# of v,v pattern. [Comma treated the same as apostrophe between vowels]
# VX : (extended) 2nd vowel of [iu][aeiou] (allowed as single VV cmavo,
# and in fu'ivla & cmene)
# VO : other vowel pairs (aa,ae,ao,ea,ee,eo,eu,oa,oe,oo,ou)
# VY : vowel pair forms involving y, maybe with a comma between (valid only
# in cmene)
# YY : 2 copies of the letter y adjacent to each other with no separation
# Notes
# - pairs & triples may have y within them. This allows this 'grammar' to
# specify the checks for whether the y is actually required or not.
# - NR is separated out from CP (and R, N separated from C) to allow checking
# the validity/necessity of the hyphenation structure after an initial CVV
# rafsi in a lujvo.
# - At the moment, the NFA contains some dead paths, mostly concerned with
# saying CI can follow N or R (which can never happen). This is where
# C|N|R is used to specify any single consonant after a vowel, and the next
# letter may or may not be part of an initial cluster. It's too tedious to
# optimise away all such cases. The net result is that the resulting DFA is
# larger than it needs to be and has unreachable states in it.
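# - Support for the cultural rafsi (section 4.16 of the Reference Grammar) has
# been made optional. To disable them, pass the file through 'grep -v
# CULTURAL' first.
# NOTE(review): many lines inside the CULTURAL_BRIDGE and CULTURAL_LUJVO
# blocks (their STATE, transition and ENDBLOCK lines) do not contain the
# string CULTURAL, and the WORD block refers to cultural_lujvo.in in lower
# case, so the grep recipe looks likely to leave dangling fragments behind.
# Confirm it still produces a valid file before relying on it.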
# Prefix applied to the tables written out for inclusion into the C program.
Prefix morf
# Abbreviations.  Each name stands for the alternation of tokens on its right
# and is used in the transition lists of the blocks below.
# CNR : any single consonant after a vowel
Abbrev CNR = C|N|R
# LCI : 2nd letter of a permissible initial pair, syllabic or not
Abbrev LCI = CI|CSI
# LCP : 2nd letter of any permissible pair (initial or not, including nr)
Abbrev LCP = CI|CSI|CP|CS|NR
# FVV : 2nd vowel of any vowel pair that does not involve y
Abbrev FVV = VV|VX|VO
# FC : same expansion as CNR; this is the spelling the fu'ivla blocks use
Abbrev FC = C|N|R
# FCP : follow-on consonant tokens not described as syllabic (CI, CP, H)
Abbrev FCP = CI|CP|H
# FCS : follow-on consonant tokens described as syllabic (CSI, CS, HS), plus
# NR (the r of the pair nr)
Abbrev FCS = CSI|CS|HS|NR
###########################################################
# Subcomponents for lujvo matching
#{{{ BLOCK SYL1
BLOCK SYL1
# First rafsi of a lujvo.  Every path starts at 'in' and leaves through one of
# the ex_* states; LUJVO_BODY connects each ex_* state to the AFTER1 entry
# state with the matching suffix, and AFTER1 then decides which glue may
# follow this rafsi.
STATE in
CNR ; V ; APOS ; V -> ex_nr # CV'V : Requires nr hyphen after or just final CCV
CNR ; V ; VV -> ex_nr # CVV : Requires nr hyphen after or just final CCV
C ; LCI ; V -> ex_cln # CCV : No special binding to next syl
CNR ; V ; CNR -> ex_cvc # CVC : Starts CVC, may require tosmabru check
CNR ; V ; CNR ; LCP -> ex_cvy # CVCC : Ditto, starts CV (AFTER1.in_cvy only accepts y next)
C ; LCI ; V ; CNR -> ex_y # CCVC : Requires y before next syl
ENDBLOCK
#}}}
#{{{ BLOCK AFTER1
BLOCK AFTER1
# Glue coming between first syllable (i.e. rafsi) and what follows
# The in_* entry states mirror the ex_* exits of SYL1.  Every path also
# consumes the first consonant of the next rafsi (or its first two, for the
# to_in_after_cc* exits).  LUJVO_BODY routes the exits ending in 0, 1 and 1t
# into its t0, t1 and t1t copies of SYL2N respectively.
# After CVV / CV'V : r or n hyphen plus next consonant, or a bare consonant
# which can only lead on to a final CCV.
STATE in_nr
R ; CP|CS -> to_lujvo1, to_after_nr_hyphen
N ; NR -> to_lujvo1, to_after_nr_hyphen
CNR -> to_final_ccv
# After CCV : any single consonant starts the next rafsi.
STATE in_cln
CNR -> to_lujvo0, to_pair0
# After CCVC : y is mandatory before the next consonant.
STATE in_y
Y ; LCP|CN -> to_lujvo0, to_pair0
Y ; CP ; BT -> to_in_after_cc0
# After CVCC : y is mandatory before the next consonant.
STATE in_cvy
Y ; LCP|CN|HS|H|BT -> to_lujvo1, to_pair1
Y ; CP|H ; BT -> to_in_after_cc1
# After CVC : a permissible pair needs no y; y is accepted across an
# impermissible pair (CN), and across an initial pair (LCI) subject to the
# tosmabru check started via to_tosmabru.
STATE in_cvc
LCP -> to_lujvo1, to_pair1
Y ; CN -> to_lujvo1, to_pair1
Y ; LCI -> to_lujvo1t, to_pair1t, to_tosmabru
Y ; CP ; BT -> to_in_after_cc1
ENDBLOCK
#}}}
#{{{ BLOCK SYL2
BLOCK SYL2 # Lujvo syllables (i.e. rafsi) 2 .. (N-1)
# The first consonant of each rafsi has already been consumed by the glue
# block in front of it, so the patterns below begin at the rafsi's 2nd
# letter.  SYL2N connects the ex_* exits to the AFTER2 entry states with the
# matching suffix.
STATE in
V ; VV -> ex_cln # CVV
V ; APOS ; V -> ex_cln # CV'V
LCI ; V -> ex_cln # CCV
V ; CNR -> ex_cvc # CVC
V ; CNR ; LCP -> ex_y # CVCC
LCI ; V ; CNR -> ex_y # CCVC
STATE in_after_cc # get here if last syl. ends in c, which when combined
# with 1st & 2nd letters of this syl. forms a bad consonant triple
V -> ex_cln # from ccv form
V ; CNR -> ex_y # from ccvc form
ENDBLOCK
#}}}
#{{{ BLOCK AFTER2
BLOCK AFTER2
# Linkage from rafsi 2->3, ..., (N-1)->N
# Each path consumes the glue (if any) plus the first consonant of the next
# rafsi and leaves via 'exit'.  'exit_after_cc' is used instead when the
# path ends on a BT token, i.e. two consonants of the next rafsi have been
# consumed; SYL2N sends that exit to the in_after_cc entries.
# After CCV, CVV or CV'V (SYL2.ex_cln) : any consonant may follow directly.
STATE in_cln
CNR -> exit
# After CVC : permissible pair directly, or y across an impermissible pair.
STATE in_cvc
Y ; CN -> exit
Y ; CP ; BT -> exit_after_cc
LCP -> exit
# After CVCC or CCVC (SYL2.ex_y) : y is mandatory.
STATE in_y
Y ; LCP|CN|HS|H|BT -> exit
Y ; CP|H ; BT -> exit_after_cc
ENDBLOCK
#}}}
#{{{ BLOCK SYLN
BLOCK SYLN
# Lujvo final syllable
# As in SYL2, the first consonant of the rafsi has already been consumed by
# the preceding glue block.  All four entry states finish on the single
# 'exit' state, which the enclosing blocks tag with a result.
# Normal entry : any final rafsi form is acceptable.
STATE in_main
V ; APOS ; V -> exit # final CV'V
V ; VV -> exit # final CVV
LCI ; V -> exit # final CCV
LCI ; V ; CNR ; V -> exit # final CCVCV
V ; CNR ; LCP ; V -> exit # final CVCCV
# Entry after an r/n hyphen : same list as in_main except that a bare final
# CCV is not accepted here (that case goes through in_to_ccv instead).
STATE in_after_nr_hyphen
V ; APOS ; V -> exit # final CV'V
V ; VV -> exit # final CVV
LCI ; V ; CNR ; V -> exit # final CCVCV
V ; CNR ; LCP ; V -> exit # final CVCCV
# Entry after an initial CVV / CV'V with no hyphen : only a final CCV will do.
STATE in_to_ccv
LCI ; V -> exit # final CCV
# Used to support bad triples with n at end of previous syllable
# (both consonants of the cluster have already been consumed)
STATE in_after_cc
V -> exit # final CCV
V ; CNR ; V -> exit # final CCVCV
ENDBLOCK
#}}}
#{{{ BLOCK SYL2N
BLOCK SYL2N
# Everything from start of 2nd syllable (less initial consonant picked off in
# AFTER1) through to end of lujvo. This is in a block because 3 instances are
# made ; one to recognize lujvo which start with a cluster (lujvo_0), a second
# to recognize those starting with CV.. (i.e. deferred cluster), and a third to
# recognize the subset of the second where a y has been inserted after an
# initial CVC to prevent a tosmabru failure. (For that third case, the
# 'tosmabru' block scans the sequence that would have been the shorter lujvo to
# check it's valid; if not, the 'y' was a bogus insertion.)
# Sub-block instances: s2 = middle rafsi, a2 = glue between rafsi,
# sn = final rafsi.  This block does not tag sn.exit itself; the blocks that
# instantiate SYL2N do that.
s2 : SYL2
a2 : AFTER2
sn : SYLN
# Entry points.  'in' only leads to a middle rafsi; a direct jump to the
# final rafsi is made by the instantiating block via sn.in_main.
STATE in -> s2.in
STATE in_after_cc -> s2.in_after_cc, sn.in_after_cc
# Bridge s2->a2
STATE s2.ex_cln -> a2.in_cln
STATE s2.ex_cvc -> a2.in_cvc
STATE s2.ex_y -> a2.in_y
# Bind a2->s2 (loop or goto final syl.)
STATE a2.exit -> s2.in, sn.in_main
STATE a2.exit_after_cc -> s2.in_after_cc, sn.in_after_cc
ENDBLOCK
#}}}
#{{{ BLOCK TOS_SYL1
BLOCK TOS_SYL1
# Match syllable 1 of what would be the shorter lujvo in a potential tosmabru
# failure.
# The patterns start at the vowel: TOSMABRU is entered from AFTER1.in_cvc
# after 'Y ; LCI', so the initial consonant pair has already been consumed.
STATE in
V -> ex_cln # (CC)V
V ; CNR -> ex_y # (CC)VC
ENDBLOCK
#}}}
#{{{ BLOCK TOS_AFTER1
BLOCK TOS_AFTER1
# Glue after 1st syllable of potential shorter lujvo (binds 1st syl. to 2nd)
# 'exit' is reached with one consonant of the next rafsi consumed;
# 'exit_to_after_cc' is reached with two consumed (path ends on a BT token).
# After (CC)V : any consonant may follow directly.
STATE in_cln
CNR -> exit
# After (CC)VC : y is mandatory.
STATE in_y
Y ; LCP|CN -> exit
Y ; CP|CS ; BT -> exit_to_after_cc
ENDBLOCK
#}}}
#{{{ BLOCK TOSMABRU
BLOCK TOSMABRU
# To check tail portion of word to see if it too is of lujvo
# form.
# Sub-block instances: s1 = first rafsi of the would-be shorter lujvo,
# a1 = glue after it, tail = the remaining rafsi (a private SYL2N copy).
# The single 'exit' state is reached when the tail matches completely.
s1 : TOS_SYL1
a1 : TOS_AFTER1
tail : SYL2N
STATE in -> s1.in
# Bridge s1->a1
STATE s1.ex_cln -> a1.in_cln
STATE s1.ex_y -> a1.in_y
# Bridge a1->tail : either straight to a final rafsi or to a middle one
STATE a1.exit -> tail.sn.in_main, tail.in
STATE a1.exit_to_after_cc -> tail.in_after_cc
# End of the final rafsi is the end of this block
STATE tail.sn.exit -> exit
ENDBLOCK
#}}}
#{{{ BLOCK LUJVO_BODY
BLOCK LUJVO_BODY
# The complete NFA for matching a word of lujvo form.
# No result tags are assigned here; the instantiating blocks (LUJVO and
# CULTURAL_LUJVO) tag t0.sn.exit, t1.sn.exit, t1t.sn.exit and tos.exit.
# Sub-block instances:
#   s1  : first rafsi            a1  : glue after the first rafsi
#   t0  : rest of word, reached via the a1 exits ending in 0
#   t1  : rest of word, reached via the a1 exits ending in 1
#   t1t : rest of word, reached via the a1 exits ending in 1t (CVC + y +
#         initial pair, i.e. the case needing the tosmabru check)
#   tos : parallel check that the tail alone is of lujvo form
s1 : SYL1
a1 : AFTER1
t0 : SYL2N
t1 : SYL2N
t1t : SYL2N
tos : TOSMABRU
STATE in -> s1.in
# Bridge s1->a1
STATE s1.ex_nr -> a1.in_nr
STATE s1.ex_cln -> a1.in_cln
STATE s1.ex_y -> a1.in_y
STATE s1.ex_cvy -> a1.in_cvy
STATE s1.ex_cvc -> a1.in_cvc
# Bridge a1->t1.sn
STATE a1.to_final_ccv -> t1.sn.in_to_ccv
STATE a1.to_after_nr_hyphen -> t1.sn.in_after_nr_hyphen
# Bridge a1 to the entries used once two consonants of the next rafsi have
# already been consumed
STATE a1.to_in_after_cc0 -> t0.in_after_cc
STATE a1.to_in_after_cc1 -> t1.in_after_cc
# Bridge a1 to final syllables for cases where a 2 rafsi lujvo is valid
# this way
STATE a1.to_pair0 -> t0.sn.in_main
STATE a1.to_pair1 -> t1.sn.in_main
STATE a1.to_pair1t -> t1t.sn.in_main
# Bridge a1 to tosmabru
STATE a1.to_tosmabru -> tos.in
# Bridge a1->t0
STATE a1.to_lujvo0 -> t0.in
# Bridge a1->t1
STATE a1.to_lujvo1 -> t1.in
# Bridge a1->t1t
STATE a1.to_lujvo1t -> t1t.in
ENDBLOCK
#}}}
#{{{ BLOCK LUJVO
BLOCK LUJVO
# This block deals with recognition of 'normal' lujvo.
# It wraps one LUJVO_BODY instance and attaches the TAG_LUJVO_* tags to its
# exit states; the tags are combined in the results section at the end of the
# file.
body : LUJVO_BODY
STATE in -> body.in
# Set exit states on t0
STATE body.t0.sn.exit = TAG_LUJVO_0
# Set exit states on t1
STATE body.t1.sn.exit = TAG_LUJVO_1
# Set exit states on t1t
STATE body.t1t.sn.exit = TAG_LUJVO_1T
# Set exit status when potentially shorter word is of valid lujvo form
# (e.g. the smabru in tosmabru)
STATE body.tos.exit = TAG_LUJVO_TAIL_OK
ENDBLOCK
#}}}
###########################################################
#{{{ BLOCK CULTURAL_BRIDGE
BLOCK CULTURAL_BRIDGE
# This block describes the extra NFA states that have to be bridged on top of
# the standard lujvo to get something that copes with cultural rafsi too.
# Three entry states are provided so that the instantiating block can join
# at whichever point matches how much of the leading consonant pair has
# already been consumed.  The single 'exit' is reached after the consonant
# that follows the vowel pair.
# Nothing consumed yet : take the first consonant of the initial pair.
STATE in_before_c
C -> in # No point using CNR, because N&R can't start initial pair
# First consonant consumed : take the 2nd letter of the initial pair.
STATE in
LCI -> in_after_cc
# Both consonants consumed : vowel pair (V'V or VV) and closing consonant.
STATE in_after_cc
V ; APOS ; V ; CNR -> exit
V ; VV ; CNR -> exit
ENDBLOCK
#}}}
###########################################################
#{{{ BLOCK CULTURAL_LUJVO
BLOCK CULTURAL_LUJVO
# Block to recognise lujvo which have 'cultural' rafsi in them.
# Obviously this recognises all normal lujvo as well, because it will cope
# with >=0 of the rafsi being cultural. That is not important, because
# this case is 'set differenced' away in the priority logic at the end of
# the file : if the word's a normal lujvo, it is never considered for
# recognition as a cultural one.
# The core lujvo - replicates the main lujvo matching block.
body : LUJVO_BODY
# The extra bits
# One CULTURAL_BRIDGE instance per place in the body where a rafsi can
# start: the first rafsi (s1), the middle rafsi of t0/t1/t1t, and the first
# rafsi (tos_s1) and middle rafsi (tos) of the tosmabru check.
s1 : CULTURAL_BRIDGE
tos_s1 : CULTURAL_BRIDGE
t0 : CULTURAL_BRIDGE
t1 : CULTURAL_BRIDGE
t1t : CULTURAL_BRIDGE
tos : CULTURAL_BRIDGE
STATE in -> body.in
##############
# Add bridging between states arising from cultural rafsi being present.
# Every bridge exit is joined to the ex_y exit of the rafsi block it
# bypasses, so a cultural rafsi is always followed by the glue rules that
# apply after ex_y.
# First rafsi : nothing consumed yet, so enter the bridge at in_before_c.
STATE body.s1.in -> s1.in_before_c
STATE s1.exit -> body.s1.ex_y
# Middle rafsi of t0 : one consonant (in) or two (in_after_cc) consumed.
STATE body.t0.s2.in -> t0.in
STATE body.t0.s2.in_after_cc -> t0.in_after_cc
STATE t0.exit -> body.t0.s2.ex_y
# Middle rafsi of t1
STATE body.t1.s2.in -> t1.in
STATE body.t1.s2.in_after_cc -> t1.in_after_cc
STATE t1.exit -> body.t1.s2.ex_y
# Middle rafsi of t1t
STATE body.t1t.s2.in -> t1t.in
STATE body.t1t.s2.in_after_cc -> t1t.in_after_cc
STATE t1t.exit -> body.t1t.s2.ex_y
# Middle rafsi of the tosmabru tail
STATE body.tos.tail.s2.in -> tos.in
STATE body.tos.tail.s2.in_after_cc -> tos.in_after_cc
STATE tos.exit -> body.tos.tail.s2.ex_y
# First rafsi of the tosmabru tail : its consonant pair is already consumed
# when tos.s1.in is reached, so enter the bridge at in_after_cc.
STATE body.tos.s1.in -> tos_s1.in_after_cc
STATE tos_s1.exit -> body.tos.s1.ex_y
##############
# Set exit states on t0
STATE body.t0.sn.exit = TAG_CULTURAL_LUJVO_0
# Set exit states on t1
STATE body.t1.sn.exit = TAG_CULTURAL_LUJVO_1
# Set exit states on t1t
STATE body.t1t.sn.exit = TAG_CULTURAL_LUJVO_1T
# Set exit status when potentially shorter word is of valid lujvo form
# (e.g. the smabru in tosmabru)
STATE body.tos.exit = TAG_CULTURAL_LUJVO_TAIL_OK
ENDBLOCK
#}}}
###########################################################
#{{{ BLOCK LUJVO_NO_Y_BAD_VOWELS
# The idea of this block is to pick out words that have lujvo consonant
# structure, but which can contain invalid vowel pairs. These have to be
# filtered out of the stage-IV fu'ivla set later on. Hence a big
# simplification : don't care about lujvo forms with 'y' in.
BLOCK LUJVO_NO_Y_BAD_VOWELS
# Matches a y-free sequence of rafsi-shaped pieces in which any FVV token
# (VV, VX or VO) is accepted as the 2nd vowel of a pair.
# State names record what the previous piece ended with:
#   c    : previous piece ended in a consonant (CVC)
#   v    : previous piece ended in a vowel
#   cvv1 : the FIRST piece was CVV / CV'V, so an r or n hyphen must follow
# Paths written '-> v, exit' may either continue with another piece or
# finish the word there.
STATE c
LCP ; V ; FVV -> v, exit
LCP ; V ; APOS ; V -> v, exit
LCP ; V ; CNR -> c
LCP ; LCI ; V -> v, exit
STATE v
CNR ; V ; FVV -> v, exit
CNR ; V ; APOS ; V -> v, exit
CNR ; LCI ; V -> v, exit
CNR ; V ; CNR -> c
STATE cvv1
# r hyphen, then the first consonant of the next piece
R ; LCP ; V ; FVV -> v, exit
R ; LCP ; V ; APOS ; V -> v, exit
R ; LCP ; V ; CNR -> c
R ; LCP ; LCI ; V -> v, exit
# n hyphen, used when the next piece starts with r (token NR)
N ; NR ; V ; FVV -> v, exit
N ; NR ; V ; APOS ; V -> v, exit
N ; NR ; V ; CNR -> c
STATE exit = TAG_LUJVO_NO_Y_BAD_VOWELS
STATE in
CNR ; V ; FVV -> cvv1
CNR ; V ; APOS ; V -> cvv1
CNR ; LCI ; V -> v
CNR ; V ; CNR -> c
# Two-piece words: initial CVV / CV'V followed directly by a final CCV,
# with no hyphen
CNR ; V ; FVV ; CNR ; LCI ; V -> exit
CNR ; V ; APOS ; V ; CNR ; LCI ; V -> exit
ENDBLOCK
#}}}
###########################################################
#{{{ BLOCK CMAVOSEQ
BLOCK CMAVOSEQ
# Recognize a sequence of cmavo. There are two exit cases : first is a
# sequence of 'normal' cmavo; this can potentially be prefixed onto a
# gismu, lujvo or fu'ivla. The 2nd may start with some 'normal' cmavo, but
# ends with one or more cmavo of the Cy form. This has to occur at the end
# of the word.
# Start : a vowel, a consonant (ordinary cmavo via 'main', or Cy via 'cy1'),
# or a bare y.
STATE in
V -> m2, mv, in1
CNR -> main, cy1
Y -> y
# in1/in2 : word-initial vowel followed by a VX token (the token list
# describes VX as allowed in a single VV cmavo).
STATE in1
VX -> in2
STATE in2
-> exit_prefixable
CNR -> main
# main : a consonant or apostrophe has just been consumed, a vowel must follow
STATE main
V -> m2, mv
# mv : a vowel has been consumed, a VV token may extend it
STATE mv
VV -> m2, mv
# m2 : at the end of a vowel group; may stop here, start another cmavo with a
# consonant, or continue with apostrophe + vowel
STATE m2
CNR -> main, cy1
APOS -> main
-> exit_prefixable
# cy1/cy2 : Cy cmavo.  After the y the word may end, or another consonant
# token (any of the pair/triple tokens) and y may follow.
STATE cy1
Y -> cy2
STATE cy2
-> exit_standalone
CI|CSI|CP|CS|CN|NR|H|HS|BT -> cy1
# y, yy, ya : word starting with y -- y alone, y followed by any number of YY
# tokens, or y + apostrophe + y
STATE y
YY -> yy
-> exit_standalone
APOS -> ya
STATE yy
YY -> yy
-> exit_standalone
STATE ya
Y -> exit_standalone
STATE exit_prefixable = TAG_CMAVOS
STATE exit_standalone = TAG_CMAVOS_END_CY
ENDBLOCK
#}}}
#{{{ BLOCK GISMU
BLOCK GISMU
# Recognize a gismu. The two cases CVC/CV and CCVCV get different exit
# statuses; this allows the scanner to back up one potential prefix cmavo in
# the CVC/CV case. (See how this is much simpler than the lujvo matcher!)
# Each transition list below carries its result tag directly on the final
# step instead of going through a named exit state.
STATE in
C ; LCI ; V ; CNR ; V = TAG_GISMU_0 # CCVCV
CNR ; V ; CNR ; LCP ; V = TAG_GISMU_1 # CVC/CV
ENDBLOCK
#}}}
###########################################################
#{{{ BLOCK SLINKUI
BLOCK SLINKUI
# Recognize a slinku'i
#
# This is basically like recognizing a lujvo but with a much reduced state
# topology, because the letter 'y' can't occur anywhere. So the final rafsi
# could be any of the valid forms, however, all earlier ones are restricted to
# CVV, CVC or CCV.
# For the first syllable, we jump in as though we've already recognized CV.
# Although the potential lujvo is always going to start CV, we distinguish the
# cases based on whether it's fu'ivla_0 or fu'ivla_1 that's going to be
# squashed by a match, to make sure the fu'ivla NFA and slinku'i NFA are
# treating the same length word tail as the match string.
# SYL2N is a superset of what's needed, because it allows y's. We'll never
# check for slinkui unless we find it's a fu'ivla so this won't cause false
# matches. Ideally, a custom SYL2N block is required, however using the
# existing lujvo one at least gives code commonality so is easier to
# maintain. It probably also makes the DFA smaller, because it will keep
# more states common with the lujvo NFA.
# As written, this block defines a single entry state (in_after_c) and a
# single result tag (TAG_SLINKUI_0).
t0 : SYL2N
STATE in_after_c # First syl is CVC
# Link to final syl
# CVC + CC... or CVC + CV...
# Must be valid pair across transition, no y allowed
# Initial C => fu'ivla has init. cons. cluster, so fu'ivla_0
# will match if anything. Hence need SLINKUI_0 result
LCP -> t0.sn.in_main, t0.in
STATE t0.sn.exit = TAG_SLINKUI_0
ENDBLOCK
#}}}
###########################################################
# Fu'ivla matching blocks (including syllabic consonant rules)
#{{{ BLOCK FV_VOWELS
BLOCK FV_VOWELS
# This describes a valid sequence of vowels within a fu'ivla
# Two exits are provided so that the instantiating block can treat short and
# long vowel groups differently:
#   ex_single : one vowel, or one vowel plus one FVV token
#   ex_multi  : anything longer (three or more vowel tokens, or any group
#               containing an apostrophe)
STATE in
V -> ex_single
V ; FVV -> ex_single
V ; FVV ; FVV -> main
V ; APOS ; V -> main
V ; FVV ; APOS ; V -> main
# main : keep absorbing further vowels, then leave via ex_multi
STATE main
APOS ; V -> main
FVV -> main
-> ex_multi
ENDBLOCK
#}}}
#{{{ BLOCK FV_INITIAL_CLUSTER
BLOCK FV_INITIAL_CLUSTER
# Consonant cluster at the very start of a fu'ivla.  The first letter is a
# plain C token; every later letter must be the 2nd letter of a permissible
# initial pair with its predecessor (CI, or CSI when syllabic).
# LCI is the file-level abbreviation for CI|CSI.
STATE in
# Two-letter clusters
C ; LCI -> exit
# Three-letter clusters whose middle letter is non-syllabic
C ; CI ; LCI -> exit
# Longer clusters ending in two syllabic letters
C ; CI ; CSI ; CSI -> exit
C ; CI ; CI ; CSI ; CSI -> exit
ENDBLOCK
#}}}
#{{{ BLOCK FV_INTERNAL_CONS_GROUP
BLOCK FV_INTERNAL_CONS_GROUP
# Group of consonants inside a fu'ivla (i.e. after at least one vowel).
# c1, c2 and c3 are reached after one, two and three consecutive tokens from
# the FC/FCP sets; cs is reached after a token from the FCS (syllabic) set.
# Entry states:
#   in          : group may be a single consonant
#   in_req_clus : group must contain at least two consonants
#   in_after_c / in_after_s : the group's first letter was already consumed
#                 as the last letter of a stage-3 hyphen triple
STATE in
FC -> c1
STATE c1
FCS -> cs
FCP -> c2
-> exit
STATE c2
FCS -> cs
FCP -> c3
-> exit
# From c3 no further FCP token is accepted, and a single FCS token must be
# followed by another consonant before the group can end.
STATE c3
FCS ; FCS -> cs
FCS ; FCP -> c1
-> exit
# After a syllabic token the count of FCP tokens starts again at c1.
STATE cs
FCS -> cs
FCP -> c1
-> exit
STATE in_req_clus
FC ; FCS -> cs
FC ; FCP -> c2
# Coming in after seeing a stage 3 hyphenation triple
# ending in a non-syllabic
STATE in_after_c -> c1
# Coming in after seeing a stage 3 hyphenation triple
# ending in a syllabic
STATE in_after_s -> cs
ENDBLOCK
#}}}
#{{{ BLOCK FUIVLA_START_V
BLOCK FUIVLA_START_V
# fu'ivla starting with up to 3 vowels, maybe with apostrophes
# between them, then a cluster.
# After the opening vowels the word alternates between consonant groups
# (cons) and vowel groups (later_v); it can only finish after a vowel group.
cons : FV_INTERNAL_CONS_GROUP
later_v : FV_VOWELS
STATE in
V -> v1
# v1, v2, v3 : one, two or three opening vowels seen so far.  From each of
# them the next thing may be the mandatory consonant cluster.
STATE v1
FVV -> v2
APOS ; V -> v2
-> cons.in_req_clus
STATE v2
FVV -> v3
APOS ; V -> v3
-> cons.in_req_clus
STATE v3 -> cons.in_req_clus
# Alternate consonant group -> vowel group -> (end of word | consonant group)
STATE cons.exit -> later_v.in
STATE later_v.ex_single
-> exit, cons.in
STATE later_v.ex_multi
-> exit, cons.in
ENDBLOCK
#}}}
###########################################################
#{{{ BLOCK FUIVLA_START_CV
BLOCK FUIVLA_START_CV
# fu'ivla starting with a single consonant and up to 2
# vowels, maybe with apostrophes between, then a cluster.
# After that opening the word alternates between consonant groups (cons) and
# vowel groups (later_v); it can only finish after a vowel group.
cons : FV_INTERNAL_CONS_GROUP
later_v : FV_VOWELS
STATE in
FC -> c
STATE c
V -> v1
# v1, v2 : one or two opening vowels seen; the consonant cluster must follow
STATE v1
FVV -> v2
APOS ; V -> v2
-> cons.in_req_clus
STATE v2 -> cons.in_req_clus
# Alternate consonant group -> vowel group -> (end of word | consonant group)
STATE cons.exit -> later_v.in
STATE later_v.ex_single
-> exit, cons.in
STATE later_v.ex_multi
-> exit, cons.in
ENDBLOCK
#}}}
#{{{ BLOCK FUIVLA_START_CC
BLOCK FUIVLA_START_CC
# fu'ivla starting with an initial consonant cluster.
# Sub-block instances: init_cc = the opening cluster, early_v = the first
# vowel group, later_c / later_v = the alternating consonant and vowel groups
# that follow.
init_cc : FV_INITIAL_CLUSTER
early_v : FV_VOWELS
later_c : FV_INTERNAL_CONS_GROUP
later_v : FV_VOWELS
STATE in
# Side path taken in parallel: a leading single consonant followed by the
# rest of the word checked as a slinku'i (FUIVLA connects goto_slinkui to
# SLINKUI.in_after_c).
FC -> goto_slinkui
-> init_cc.in
STATE init_cc.exit -> early_v.in
# The word may end after the first vowel group only if that group took the
# ex_multi exit; after an ex_single group more consonants are required.
STATE early_v.ex_multi -> exit, later_c.in
STATE early_v.ex_single -> later_c.in
STATE later_c.exit -> later_v.in
STATE later_v.ex_multi -> exit, later_c.in
STATE later_v.ex_single -> exit, later_c.in
ENDBLOCK
#}}}
#{{{ BLOCK STAGE3_TAIL
BLOCK STAGE3_TAIL
# Everything in a stage-3 fu'ivla from the hyphen onwards.  The instantiating
# block matches the rafsi part first and then enters here at 'in'.
# goto_c and goto_s are kept as separate named states so that instantiating
# blocks can attach attributes to them.
later_c : FV_INTERNAL_CONS_GROUP
later_v : FV_VOWELS
STATE in -> before_hyph
# Central letter of hyphen (l,n or r) is always syllabic
STATE before_hyph
CSI|CS|HS|NR -> after_hyph
# Last letter of the hyphen triple: H = non-syllabic, HS = syllabic
STATE after_hyph
H -> goto_c
HS -> goto_s
STATE goto_c -> later_c.in_after_c
STATE goto_s -> later_c.in_after_s
# Alternate consonant group -> vowel group -> (end of word | consonant group)
STATE later_c.exit -> later_v.in
STATE later_v.ex_multi -> exit, later_c.in
STATE later_v.ex_single -> exit, later_c.in
ENDBLOCK
#}}}
#{{{ BLOCK STAGE3_SHORT
BLOCK STAGE3_SHORT
# Recognize a stage-3 fu'ivla starting CVC
tail : STAGE3_TAIL
STATE in
CNR ; V ; CNR -> tail.in
# Attach attribute AT_S3_3 to the states reached just after the hyphen
# triple (presumably so the scanner can locate the hyphen, as the note in
# X_STAGE3 says for its own attributes -- confirm against the C consumer).
STATE tail.goto_c (AT_S3_3)
STATE tail.goto_s (AT_S3_3)
STATE tail.exit -> exit
ENDBLOCK
#}}}
#{{{ BLOCK STAGE3_LONG
BLOCK STAGE3_LONG
# Recognize a stage-3 fu'ivla starting CVCC or CCVC
tail : STAGE3_TAIL
STATE in
C ; LCI ; V ; CNR -> tail.in # CCVC
CNR ; V ; CNR ; LCP -> tail.in # CVCC
# Attach attribute AT_S3_4 to the states reached just after the hyphen
# triple (presumably so the scanner can locate the hyphen, as the note in
# X_STAGE3 says for its own attributes -- confirm against the C consumer).
STATE tail.goto_c (AT_S3_4)
STATE tail.goto_s (AT_S3_4)
STATE tail.exit -> exit
ENDBLOCK
#}}}
###########################################################
# Recognize an "extended" stage-3, i.e. one with multiple
# rafsi prior to the hyphen.
#{{{ BLOCK X_STAGE3_CC_HEAD
BLOCK X_STAGE3_CC_HEAD
# First rafsi of an extended stage-3 fu'ivla when it starts with a consonant
# pair.  The path also consumes the first consonant of the following rafsi,
# which is the position X_STAGE3_OTHER_RAFSI expects to start from.
STATE in
CNR ; LCI ; V ; CNR -> exit
ENDBLOCK
#}}}
#{{{ BLOCK X_STAGE3_CV_HEAD
BLOCK X_STAGE3_CV_HEAD
# First rafsi of an extended stage-3 fu'ivla when it starts consonant-vowel.
# Every path also consumes the first consonant of the following rafsi.
STATE in
CNR ; V ; APOS ; V -> after_cvv # CV'V
CNR ; V ; VV -> after_cvv # CVV
CNR ; V ; CNR -> after_cvc # CVC
# After CVV / CV'V : an r hyphen (or n hyphen before r) is required, as in
# AFTER1.in_nr.
STATE after_cvv
R ; CP | CS -> exit
N ; NR -> exit
# After CVC : the next consonant must form a permissible non-initial pair
# (CP, CS or NR) with it.
STATE after_cvc
CP|CS|NR -> exit
ENDBLOCK
#}}}
#{{{ BLOCK X_STAGE3_OTHER_RAFSI
BLOCK X_STAGE3_OTHER_RAFSI
# Second and later rafsi of an extended stage-3 fu'ivla.  The leading
# consonant of each rafsi was consumed by whatever came before, so patterns
# begin at the rafsi's 2nd letter.  A path to 'in' loops for another rafsi;
# exit3 and exit4 mark the last rafsi before the hyphen (X_STAGE3 sends
# exit3 to its short tail and exit4 to its long tail).
STATE in
# Four-letter forms : may be followed by another rafsi or by the hyphen
LCI ; V ; CNR -> in, exit4
V ; CNR ; LCP -> in, exit4
# Form that can only be followed by another rafsi
V ; VV ; CNR -> in
# Three-letter form ending in a consonant : only as last rafsi before hyphen
V ; CNR -> exit3
ENDBLOCK
#}}}
#{{{ BLOCK X_STAGE3
BLOCK X_STAGE3
# Extended stage-3 fu'ivla: a head rafsi, one or more further rafsi, then
# the hyphen and tail.  Two STAGE3_TAIL instances are used so that the
# result tag and hyphen attribute can differ according to whether the last
# rafsi before the hyphen left via exit3 or exit4.
cc_head : X_STAGE3_CC_HEAD
cv_head : X_STAGE3_CV_HEAD
other_rafsi : X_STAGE3_OTHER_RAFSI
short_tail : STAGE3_TAIL
long_tail : STAGE3_TAIL
STATE in -> cc_head.in, cv_head.in
STATE cc_head.exit -> other_rafsi.in
STATE cv_head.exit -> other_rafsi.in
STATE other_rafsi.exit3 -> short_tail.in
STATE other_rafsi.exit4 -> long_tail.in
# Result tags are set directly on the tail exits
STATE short_tail.exit = TAG_X_STAGE3_CVC
STATE long_tail.exit = TAG_X_STAGE3_LONG
# Add attributes for grabbing hyphen position
STATE short_tail.goto_c (AT_XS3_3)
STATE short_tail.goto_s (AT_XS3_3)
STATE long_tail.goto_c (AT_XS3_4)
STATE long_tail.goto_s (AT_XS3_4)
ENDBLOCK
#}}}
###########################################################
#{{{ BLOCK FUIVLA
BLOCK FUIVLA
# Recognize a fuivla
# Runs all fu'ivla shapes in parallel and tags whichever ones match; the
# results section combines the tags.  The xstage3 instance sets its own tags
# internally, so none are assigned for it here.
start_cc : FUIVLA_START_CC
start_cv : FUIVLA_START_CV
start_v : FUIVLA_START_V
slinkui : SLINKUI
stage3_short : STAGE3_SHORT
stage3_long : STAGE3_LONG
xstage3 : X_STAGE3
# Entry used at the very start of a word : every shape is tried.
STATE in_no_prefix -> start_cc.in, start_cv.in, start_v.in,
stage3_short.in, stage3_long.in,
xstage3.in
# Entry used after a cmavo prefix (see WORD) : same list, except that the
# vowel-initial shape start_v is not tried.
STATE in_prefixed -> start_cc.in, start_cv.in,
stage3_short.in, stage3_long.in,
xstage3.in
# Slinku'i check runs alongside the cluster-initial match
STATE start_cc.goto_slinkui -> slinkui.in_after_c
STATE start_cc.exit = TAG_FUIVLA_0
STATE start_cv.exit = TAG_FUIVLA_1
STATE start_v.exit = TAG_FUIVLA_1
STATE stage3_short.exit = TAG_STAGE3_CVC
STATE stage3_long.exit = TAG_STAGE3_LONG
ENDBLOCK
#}}}
###########################################################
#{{{ BLOCK CMENE
BLOCK CMENE
# Recognize a cmene. Has to end with consonant, and y is treated like a vowel.
# Take care with just a y occurring between consonants; the front end returns
# consonant pair tokens in this case (only real vowels clear the front-end
# state machine); that behaviour is needed so that when ..CyC.. occurs in
# lujvo, the lujvo matching NFA can check whether the consonant cluster was
# such as to require the y. (Extra y's are illegal if not necessary.) Checks
# for la, doi etc within the word are done later. It's too hard to do those
# checks and word splits in here without conflicting with the processing of
# lujvo etc.
# Note, uppercase validation is also separate. The front end tracks whether an
# uppercase letter has been seen, then case-folds the letter. At the end the
# condition (had_uppercase & !cmene) implies a bad word.
# State names record the kind of letter just consumed: c = consonant,
# v = vowel, y = the letter y, a = apostrophe.  Only state c can reach exit.
STATE in
CNR -> c
V -> v
Y ; CNR -> c
STATE c
V -> v
Y -> y
LCP|HS|H -> c
-> exit
STATE v
FVV |VY -> v
CNR -> c
APOS -> a
STATE a
V|Y -> v
# After a y the following consonant arrives as a pair/triple token, so the
# impermissible-pair (CN) and banned-triple (BT) tokens are accepted here
# even though state c does not accept them.
STATE y
APOS -> a
VY -> v
YY -> y
LCP|HS|H|CN|BT -> c
CP ; BT -> c # deal with nytc, nyts, nydj, nydz
STATE exit
= TAG_CMENE
ENDBLOCK
#}}}
#{{{ BLOCK WORD
BLOCK WORD
# Top level NFA to recognize a word.
# All word-type recognizers are instantiated once and run in parallel; the
# tags they set are resolved by the results section that follows this block.
gismu : GISMU
lujvo : LUJVO
cultural_lujvo : CULTURAL_LUJVO
lujvo_no_y_bad_vowels : LUJVO_NO_Y_BAD_VOWELS
cms : CMAVOSEQ
cmene : CMENE
fuivla : FUIVLA
# At the start of the word every recognizer is started.
STATE in -> gismu.in, lujvo.in, cultural_lujvo.in,
fuivla.in_no_prefix, cms.in, cmene.in,
lujvo_no_y_bad_vowels.in
# After a prefixable run of cmavo the brivla recognizers are started again,
# so that cmavo + gismu/lujvo/fu'ivla compounds are seen.  The fu'ivla
# recognizer is entered at in_prefixed here; cms and cmene are not restarted.
STATE cms.exit_prefixable -> gismu.in, lujvo.in, cultural_lujvo.in,
fuivla.in_prefixed,
lujvo_no_y_bad_vowels.in
#####
STATE in # ENTRY STATE NAMED LAST IN FILE
ENDBLOCK
#}}}
###########################################################
# Results definition section
# This is the priority encoding logic to determine
# the final word type
# In the expressions below '&' is and, '|' is or and '~' is not, applied to
# the TAG_* names set on exit states in the blocks above.
Result TAG_GISMU_0 -> R_GISMU_0
Result TAG_GISMU_1 -> R_GISMU_1
# NOTE(review): R_LUJVO_0 and R_LUJVO_1 are referenced as symbols in
# VALID_LUJVO further down, so SymResult presumably declares the result name
# as a symbol as well as a result -- confirm against the generator.
SymResult TAG_LUJVO_0 -> R_LUJVO_0
SymResult TAG_LUJVO_1 & ~TAG_LUJVO_0 -> R_LUJVO_1
# y inserted after an initial CVC, but the tail is not itself of lujvo form
Result TAG_LUJVO_1T & ~TAG_LUJVO_TAIL_OK -> R_BAD_TOSMABRU
# Can't do this in a single stage, because the name conflicts with
# R_LUJVO_1 above.
Symbol S_LUJVO_1T = TAG_LUJVO_1T & TAG_LUJVO_TAIL_OK
Result S_LUJVO_1T -> R_LUJVO_1
# 'cultural' lujvo, i.e. ones containing >=1 cultural rafsi (CCVVCV).
# These have some simplifications; we know one of these lujvo must contain 'y',
# so these don't have to enter into checking fu'ivla validity later on.
# Recall that the CULTURAL_LUJVO block matches all standard lujvo too, so
# these have to be factored out.
# Fortunately, there is direct equivalence between the ordinary and cultural
# varieties in terms of the 0/1/1T status.
Symbol S_CULTURAL_0 = TAG_CULTURAL_LUJVO_0 & ~TAG_LUJVO_0
Symbol S_CULTURAL_1 = TAG_CULTURAL_LUJVO_1 & ~TAG_LUJVO_1
Symbol S_CULTURAL_1T = TAG_CULTURAL_LUJVO_1T & ~TAG_LUJVO_1T
Symbol S_CULTURAL_TAIL_OK = TAG_CULTURAL_LUJVO_TAIL_OK
Result S_CULTURAL_0 -> R_CULTURAL_LUJVO_0
Result S_CULTURAL_1 & ~S_CULTURAL_0 -> R_CULTURAL_LUJVO_1
Result S_CULTURAL_1T & ~S_CULTURAL_TAIL_OK -> R_CULTURAL_BAD_TOSMABRU
Result S_CULTURAL_1T & S_CULTURAL_TAIL_OK -> R_CULTURAL_LUJVO_1
# VLG is true when the word is already a valid lujvo or gismu; every
# fu'ivla / slinku'i outcome below is suppressed in that case.
Symbol VALID_LUJVO = S_LUJVO_1T | R_LUJVO_1 | R_LUJVO_0
Symbol VALID_GISMU = TAG_GISMU_0 | TAG_GISMU_1
Symbol VLG = VALID_LUJVO | VALID_GISMU
# Cluster-initial fu'ivla whose tail passes the slinku'i check, with no
# FUIVLA_1 reading to fall back on
Result ~VLG & TAG_FUIVLA_0 & TAG_SLINKUI_0 & ~TAG_FUIVLA_1 -> R_BAD_SLINKUI
Symbol S_FUIVLA_0 = ~VLG & TAG_FUIVLA_0 & ~TAG_SLINKUI_0
# S_FUIVLA_1A : the FUIVLA_0 reading fails the slinku'i check but a FUIVLA_1
# reading exists.  S_FUIVLA_1B : only the FUIVLA_1 reading exists.
Symbol S_FUIVLA_1A = ~VLG & TAG_FUIVLA_0 & TAG_SLINKUI_0 & TAG_FUIVLA_1
Symbol S_FUIVLA_1B = ~VLG & ~TAG_FUIVLA_0 & TAG_FUIVLA_1
Symbol S_FUIVLA_1 = S_FUIVLA_1A | S_FUIVLA_1B
# The logic here is that if a stage 3 fu'ivla could start with either a CVC or
# a 4 letter rafsi prefix, prefer the 4 letter form. Otherwise, you could not
# write a stage 3 fu'ivla starting with that rafsi! (If this renders a