add lm-based filters

WorksApplications · Dec 11, 2023 · fed8a9c · fed8a9c
1 parent 40e844a
commit fed8a9c
Show file tree

Hide file tree

Showing 5 changed files with 136 additions and 18 deletions.
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMDocAvgPerplexity.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMDocAvgPerplexity.scala
@@ -1,10 +1,11 @@
 package com.worksap.nlp.uzushio.lib.filters
 
 import com.github.jbaiter.kenlm.BufferEvaluator
-import com.worksap.nlp.sudachi.Dictionary
+import com.worksap.nlp.sudachi.{Dictionary, Morpheme}
 import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph}
 import com.worksap.nlp.uzushio.lib.filters.base.{DocFilter, HighLowDocFilter}
 import com.worksap.nlp.uzushio.lib.resources.{KenLM, Sudachi}
+import com.worksap.nlp.uzushio.lib.utils.Paragraphs
 
 class KenLMDocAvgPerplexity(
     sudachi: String,
@@ -15,7 +16,7 @@ class KenLMDocAvgPerplexity(
 ) extends HighLowDocFilter {
 
   @transient
-  private val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)
+  private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)
 
   override def checkDocument(doc: Document): Document = {
     val perplexity = measureDoc(doc)
@@ -40,17 +41,6 @@ class KenLMDocAvgPerplexity(
   override def describeFilter: String = s"KenLMAvgDoc($outliers)"
 }
 
-class KenLMParagraphPerplexity(
-    sudachi: String,
-    kenlm: String,
-    adjacent: Int = 3,
-    threshold: Float = 1e6f
-) extends DocFilter {
-  override def checkDocument(doc: Document): Document = ???
-
-  override val toString = s"KenLMPar($adjacent,$threshold)"
-}
-
 class KenLMEvaluator(sudachi: String, kenlm: String) {
   private val dictionary: Dictionary = Sudachi.get(sudachi)
   final protected val tokenizer = dictionary.create()
@@ -64,14 +54,30 @@ class KenLMEvaluator(sudachi: String, kenlm: String) {
     ev.clear()
     while (iter.hasNext && continue) {
       val token = iter.next()
-      if (token.normalizedForm() != " ") {
+      if (acceptedToken(token)) {
         val remaining = ev.append(token.surface())
         continue = remaining > 0
       }
     }
     ev
   }
 
+  /**
+    * Decides whether a morpheme is appended to the evaluation buffer.
+    *
+    * A morpheme is rejected when its normalized form is a single space, or when
+    * its surface is exactly one character long and that character is
+    * `Paragraphs.HTML_LINK_START`, `Paragraphs.HTML_LINK_END` or a line feed.
+    * Every other morpheme is accepted.
+    *
+    * @param x morpheme to inspect
+    * @return `true` if the morpheme should be kept, `false` if it should be skipped
+    */
+  def acceptedToken(x: Morpheme): Boolean = {
+    // The normalized form is tested first; the surface is only read for non-space tokens.
+    x.normalizedForm() != " " && {
+      val surface = x.surface()
+      // Only single-character surfaces can be one of the rejected marker characters.
+      surface.length != 1 || (surface.charAt(0) match {
+        case Paragraphs.HTML_LINK_START | Paragraphs.HTML_LINK_END | '\n' => false
+        case _ => true
+      })
+    }
+  }
+
   def extractScore(ev: BufferEvaluator): Double = ev.evaluate()
 
   def scoreParagraph(p: Paragraph): Double = {

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMParagraphPerplexity.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMParagraphPerplexity.scala
@@ -0,0 +1,106 @@
+package com.worksap.nlp.uzushio.lib.filters
+
+import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph}
+import com.worksap.nlp.uzushio.lib.filters.base.DocFilter
+
+import scala.collection.mutable
+
+/**
+  * Pairs a [[Paragraph]] with a score computed for it.
+  *
+  * @param p   the wrapped paragraph
+  * @param ppx score attached to the paragraph by whoever builds this value
+  */
+final case class ParagraphWithPerplexity(p: Paragraph, ppx: Float) {
+
+  /** Forwards to the wrapped paragraph's own `isAlive`. */
+  def isAlive: Boolean = p.isAlive
+
+  /**
+    * Returns a new pair whose paragraph is a copy with its `remove` field set to `x`.
+    * The score is carried over unchanged.
+    */
+  def remove(x: AnyRef): ParagraphWithPerplexity =
+    ParagraphWithPerplexity(p.copy(remove = x), ppx)
+}
+
+/**
+  * Document filter that marks paragraphs for removal based on per-paragraph
+  * scores produced by a [[KenLMEvaluator]].
+  *
+  * Every paragraph is scored with `scoreParagraph`. A paragraph that is still
+  * alive is marked as removed by this filter when none of the scores in a window
+  * of up to `count` paragraphs ending at it, or starting at it, is greater than
+  * `-log10(threshold)`. Windows are truncated at the document boundaries, so
+  * fewer than `count` paragraphs are inspected near the first and last paragraph.
+  * When a paragraph is marked, up to `count` preceding paragraphs that are alive
+  * and score at or below the same limit are marked as well.
+  *
+  * @param sudachi   passed through to `KenLMEvaluator.make`
+  * @param kenlm     passed through to `KenLMEvaluator.make`
+  * @param outliers  passed through to `KenLMEvaluator.make`
+  * @param count     size of the paragraph window that is inspected
+  * @param threshold value whose negated base-10 logarithm is the score limit
+  */
+class KenLMParagraphPerplexity(
+    sudachi: String,
+    kenlm: String,
+    outliers: Float = 0.02f,
+    count: Int = 3,
+    threshold: Float = 1e6f
+) extends DocFilter {
+  // Score limit: paragraphs scoring above it never count towards a removal window.
+  private val lmScore = -Math.log10(threshold).toFloat
+
+  // Built lazily and excluded from serialization.
+  @transient
+  private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)
+
+  /** True when the paragraph's score is strictly above the limit. */
+  private def aboveLimit(item: ParagraphWithPerplexity): Boolean = item.ppx > lmScore
+
+  /**
+    * Scores all paragraphs of `doc` and marks the ones selected by [[markParagraphs]].
+    *
+    * @return a copy of `doc` with the updated paragraphs if at least one paragraph
+    *         was marked, otherwise `doc` itself
+    */
+  override def checkDocument(doc: Document): Document = {
+    val evaluator = processor
+    val scored = doc.paragraphs
+      .map(par => ParagraphWithPerplexity(par, evaluator.scoreParagraph(par).toFloat)).toBuffer
+
+    if (markParagraphs(scored) > 0) doc.copy(paragraphs = scored.map(_.p)) else doc
+  }
+
+  /**
+    * Walks the buffer front to back and marks, in place, every alive paragraph for
+    * which [[shouldRemoveBack]] or [[shouldRemoveFwd]] holds, together with the
+    * preceding paragraphs selected by [[removePrev]].
+    *
+    * @return the total number of paragraphs marked during this call
+    */
+  def markParagraphs(paragraphs: mutable.Buffer[ParagraphWithPerplexity]): Int = {
+    val len = paragraphs.length
+    var nchanged = 0
+    for (idx <- 0 until len) {
+      val current = paragraphs(idx)
+      val selected = current.isAlive &&
+        (shouldRemoveBack(paragraphs, idx) || shouldRemoveFwd(paragraphs, idx, len))
+      if (selected) {
+        paragraphs(idx) = current.remove(this)
+        nchanged += removePrev(paragraphs, idx) + 1
+      }
+    }
+    nchanged
+  }
+
+  /**
+    * Marks up to `count` paragraphs directly before `offset` that are alive and
+    * score at or below the limit. The paragraph at `offset` itself is not touched.
+    *
+    * @return the number of paragraphs marked
+    */
+  def removePrev(paragraphs: mutable.Buffer[ParagraphWithPerplexity], offset: Int): Int = {
+    val first = math.max(offset - count, 0)
+    var removed = 0
+    // Visit offset - 1 down to first; the range is empty when offset is 0.
+    for (idx <- (offset - 1) to first by -1) {
+      val candidate = paragraphs(idx)
+      if (candidate.isAlive && candidate.ppx <= lmScore) {
+        paragraphs(idx) = candidate.remove(this)
+        removed += 1
+      }
+    }
+    removed
+  }
+
+  /**
+    * Checks the window of up to `count` paragraphs that ends at `offset` (inclusive).
+    *
+    * @return `true` if no paragraph in the window scores above the limit
+    */
+  def shouldRemoveBack(
+      paragraphs: mutable.Buffer[ParagraphWithPerplexity],
+      offset: Int
+  ): Boolean = {
+    val end = math.max(offset - count + 1, 0)
+    // Scanned from offset backwards, stopping at the first score above the limit.
+    (offset to end by -1).forall(idx => !aboveLimit(paragraphs(idx)))
+  }
+
+  /**
+    * Checks the window of up to `count` paragraphs that starts at `offset` (inclusive)
+    * and never reaches past `length`.
+    *
+    * @return `true` if no paragraph in the window scores above the limit
+    */
+  def shouldRemoveFwd(
+      paragraphs: mutable.Buffer[ParagraphWithPerplexity],
+      offset: Int,
+      length: Int
+  ): Boolean = {
+    val end = math.min(offset + count, length)
+    (offset until end).forall(idx => !aboveLimit(paragraphs(idx)))
+  }
+
+  override val toString = s"KenLMPar($outliers,$count,$threshold)"
+}
diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/base/HighLowDocFilter.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/base/HighLowDocFilter.scala
@@ -22,10 +22,10 @@ trait HighLowDocFilter extends DocFilter { self =>
   }
 
   @transient object High {
-    override val toString = s"${self.getClass.getSimpleName}.High($high)"
+    override val toString = s"$describeFilter.High($high)"
   }
 
-  override def toString = s"${self.getClass.getSimpleName}($low,$high)"
+  override def toString = s"$describeFilter($low,$high)"
 }
 
 trait HighLowDocIntFilter extends DocFilter { self =>

diff --git a/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala b/lib/src/main/scala/com/worksap/nlp/uzushio/lib/runners/DeduplicateParagraphs.scala
@@ -950,9 +950,8 @@ object DeduplicateParagraphs {
       descr = "Spark StorageLevel for caching operations"
     )
     val textOnly = toggle(default = Some(false), descrYes = "output only text")
-    verify()
-
     val replacements = props[String]('P', descr = "Properties to resolve in filter chains")
+    verify()
 
     def toArgs: Args = Args(
       inputs = input(),

diff --git a/scripts/pipeline_test_perplexity.conf b/scripts/pipeline_test_perplexity.conf
@@ -0,0 +1,7 @@
+filters: [
+    {"class": "DocLength", "low": 50},
+    {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e7 },
+    {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e6 },
+    {"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e5 },
+    {"class": "KenLMParagraphPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, count: 3, threshold: 1e6 },
+]