diff --git a/MWES.txt b/MWES.txt index 7028db8..7d37bf1 100644 --- a/MWES.txt +++ b/MWES.txt @@ -1,5 +1,7 @@ -Strong MWEs: 3001 +Strong MWEs: 3018 +...not counting goeswith MWEs: 3001 Weak MWEs: 705 +...not counting goeswith MWEs: 705 MWE Gaps ======== @@ -33,24 +35,24 @@ Weak MWE token positions Strong MWEs by LexCat ===================== - 107 ADJ - 45 ADV + 115 ADJ + 47 ADV 131 AUX 9 CCONJ 99 DET 155 DISC 2 INF.P 2 INTJ - 1225 N + 1226 N 1 NUM - 135 P + 137 P 187 PP - 11 PRON - 8 SCONJ + 13 PRON + 9 SCONJ 3 SYM 121 V.IAV 12 V.LVC.cause 124 V.LVC.full - 319 V.VID + 320 V.VID 207 V.VPC.full 98 V.VPC.semi diff --git a/dev/MWES.txt b/dev/MWES.txt index 4396223..f6ae70d 100644 --- a/dev/MWES.txt +++ b/dev/MWES.txt @@ -1,5 +1,7 @@ -Strong MWEs: 284 +Strong MWEs: 287 +...not counting goeswith MWEs: 284 Weak MWEs: 67 +...not counting goeswith MWEs: 67 MWE Gaps ======== @@ -27,7 +29,7 @@ Weak MWE token positions Strong MWEs by LexCat ===================== - 11 ADJ + 13 ADJ 4 ADV 5 AUX 2 CCONJ @@ -35,7 +37,7 @@ Strong MWEs by LexCat 10 DISC 157 N 1 NUM - 9 P + 10 P 19 PP 1 PRON 1 SYM diff --git a/releaseutil/stats.sh b/releaseutil/stats.sh index 7ae3fe9..7d56027 100755 --- a/releaseutil/stats.sh +++ b/releaseutil/stats.sh @@ -20,10 +20,16 @@ echo "* [MWEs](MWES.txt)" >> STATS.md echo "* [Supersenses](SUPERSENSES.txt)" >> STATS.md echo -n "Strong MWEs: " > MWES.txt +SMWES=`egrep -v '^$' $DATA | egrep -v '^#' | cut -f11 | egrep ':1$' | wc -l` +printf "%4d\n" "$SMWES" >> MWES.txt +echo -n "...not counting goeswith MWEs: " >> MWES.txt SMWES=`egrep -v '^$' $DATA | egrep -v '^#' | cut -f13 | fgrep ' ' | wc -l` printf "%4d\n" "$SMWES" >> MWES.txt echo -n "Weak MWEs: " >> MWES.txt +WMWES=`egrep -v '^$' $DATA | egrep -v '^#' | cut -f16 | egrep ':1$' | wc -l` +printf "%4d\n" "$WMWES" >> MWES.txt +echo -n "...not counting goeswith MWEs: " >> MWES.txt WMWES=`egrep -v '^$' $DATA | egrep -v '^#' | cut -f18 | fgrep ' ' | wc -l` printf "%4d\n" "$WMWES" >> MWES.txt @@ -62,7 +68,7 @@ echo "" >> MWES.txt echo "Strong MWEs by LexCat" >> MWES.txt echo "=====================" >> MWES.txt -egrep -v '^$' $DATA | egrep -v '^#' | cut -f12-13 | fgrep ' ' | cut -f1 | sort | uniq -c >> MWES.txt +egrep -v '^$' $DATA | egrep -v '^#' | cut -f11-12 | egrep ':1 ' | cut -f2 | sort | uniq -c >> MWES.txt diff --git a/test/MWES.txt b/test/MWES.txt index 03a18a5..78abce4 100644 --- a/test/MWES.txt +++ b/test/MWES.txt @@ -1,5 +1,7 @@ -Strong MWEs: 281 +Strong MWEs: 283 +...not counting goeswith MWEs: 281 Weak MWEs: 80 +...not counting goeswith MWEs: 80 MWE Gaps ======== @@ -32,7 +34,7 @@ Weak MWE token positions Strong MWEs by LexCat ===================== - 17 ADJ + 19 ADJ 7 ADV 10 AUX 10 DET diff --git a/train/MWES.txt b/train/MWES.txt index 5f6d7da..721de48 100644 --- a/train/MWES.txt +++ b/train/MWES.txt @@ -1,5 +1,7 @@ -Strong MWEs: 2436 +Strong MWEs: 2448 +...not counting goeswith MWEs: 2436 Weak MWEs: 558 +...not counting goeswith MWEs: 558 MWE Gaps ======== @@ -33,23 +35,23 @@ Weak MWE token positions Strong MWEs by LexCat ===================== - 79 ADJ - 34 ADV + 83 ADJ + 36 ADV 116 AUX 7 CCONJ 78 DET 135 DISC 2 INF.P 1 INTJ - 939 N - 116 P + 940 N + 117 P 150 PP - 8 PRON - 7 SCONJ + 10 PRON + 8 SCONJ 2 SYM 99 V.IAV 11 V.LVC.cause 109 V.LVC.full - 271 V.VID + 272 V.VID 184 V.VPC.full 88 V.VPC.semi