Skip to content

Commit

Permalink
gendir fixes
Browse files (browse the repository at this point in the history)
fixes #48
  • Loading branch information
flammie committed Apr 10, 2024
1 parent d34af3a commit 43594b0
Showing 1 changed file with 11 additions and 11 deletions.
Original file line number | Diff line number | Diff line change
Expand Up
@@ -38,19 +38,19 @@ mob_corpus_size_limit_command=$(shell \
$(AM_V_GEN)$(HFST_TXT2FST) $(HFST_FLAGS) -f openfst-tropical $< -o $@

# sort the clean corpus:
%.sort.txt: weights/%.clean.txt
$(AM_V_GEN)sort < $< > $@
.generated/%.sort.txt: weights/%.clean.txt $(GENDIR)
$(AM_V_GEN)LC_ALL=C.utf8 sort < $< > $@

# token count:
%.wordcount.txt: %.sort.txt
.generated/%.wordcount.txt: .generated/%.sort.txt
$(AM_V_GEN)wc -l < $< > $@

# Unique the sorted, clean corpus:
mob_%.uniq.txt: %.sort.txt
.generated/mob_%.uniq.txt: .generated/%.sort.txt
$(AM_V_GEN)uniq -c < $< | sort -nr $(mob_corpus_size_limit_command) > $@

# type count:
mob_%.typecount.txt: mob_%.uniq.txt
.generated/mob_%.typecount.txt: .generated/mob_%.uniq.txt
$(AM_V_GEN)wc -l < $< > $@

# calculate unit weight, smoothed using ALPHA:
Expand All
@@ -66,16 +66,16 @@ mob_%.typecount.txt: mob_%.uniq.txt
# | $(BC) -l > $@
#
# add tropical weights to the corpus:
mob_%.tropical.txt: mob_%.uniq.txt %.wordcount.txt mob_%.typecount.txt
.generated/mob_%.tropical.txt: .generated/mob_%.uniq.txt .generated/%.wordcount.txt .generated/mob_%.typecount.txt
$(AM_V_GEN)cat $< |\
$(GAWK) -v CS="$$(cat $*.wordcount.txt)" \
-v DS="$$(cat mob_$*.typecount.txt)" \
$(GAWK) -v CS="$$(cat .generated/$*.wordcount.txt)" \
-v DS="$$(cat .generated/mob_$*.typecount.txt)" \
-v ALPHA=$(ALPHA) \
-f $(GTCORE)/scripts/uniq_count2tropical_weight.awk \
> $@

# build an fst of surface forms with tropical weights for each word form:
.generated/mob_%.surfs.hfst: mob_%.tropical.txt $(GENDIR)
.generated/mob_%.surfs.hfst: .generated/mob_%.tropical.txt
$(AM_V_STR2FST)cat $< |\
$(HFST_STRINGS2FST) -j $(HFST_FLAGS) -f openfst-tropical -o $@

Expand All
@@ -89,10 +89,10 @@ mob_%.tropical.txt: mob_%.uniq.txt %.wordcount.txt mob_%.typecount.txt

# Add the unit weight to each unit in compounds, both dynamic and lexical:
.generated/mob_unitweighted.hfst: $(UW_SPELLER_SRC) \
$(MOB_UNITWEIGHT) \
.generated/$(MOB_UNITWEIGHT) \
$(srcdir)/weights/word-boundary.txt
$(AM_V_REWEIGHT)$(HFST_REWEIGHT) $(HFST_FLAGS) \
-e -a $$(cat $(MOB_UNITWEIGHT)) $< \
-e -a $$(cat .generated/$(MOB_UNITWEIGHT)) $< \
-o $@

# Keep these intermediate targets when building using --debug:
Expand Down

0 comments on commit 43594b0

Please sign in to comment.