Skip to content

Commit

Permalink
add lm-based filters
Browse files Browse the repository at this point in the history
  • Loading branch information
eiennohito committed Dec 11, 2023
1 parent 40e844a commit fed8a9c
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 18 deletions.
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package com.worksap.nlp.uzushio.lib.filters

import com.github.jbaiter.kenlm.BufferEvaluator
import com.worksap.nlp.sudachi.Dictionary
import com.worksap.nlp.sudachi.{Dictionary, Morpheme}
import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph}
import com.worksap.nlp.uzushio.lib.filters.base.{DocFilter, HighLowDocFilter}
import com.worksap.nlp.uzushio.lib.resources.{KenLM, Sudachi}
import com.worksap.nlp.uzushio.lib.utils.Paragraphs

class KenLMDocAvgPerplexity(
sudachi: String,
Expand All @@ -15,7 +16,7 @@ class KenLMDocAvgPerplexity(
) extends HighLowDocFilter {

@transient
private val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)
private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)

override def checkDocument(doc: Document): Document = {
val perplexity = measureDoc(doc)
Expand All @@ -40,17 +41,6 @@ class KenLMDocAvgPerplexity(
override def describeFilter: String = s"KenLMAvgDoc($outliers)"
}

/**
  * Placeholder for a paragraph-level KenLM perplexity filter.
  *
  * `checkDocument` is not implemented: its body is `???`, which throws
  * `scala.NotImplementedError` when the method is invoked.
  *
  * @param sudachi   not used by any code in this stub
  * @param kenlm     not used by any code in this stub
  * @param adjacent  only used in the `toString` description (default 3)
  * @param threshold only used in the `toString` description (default 1e6)
  */
class KenLMParagraphPerplexity(
sudachi: String,
kenlm: String,
adjacent: Int = 3,
threshold: Float = 1e6f
) extends DocFilter {
// Not implemented yet: calling this throws NotImplementedError.
override def checkDocument(doc: Document): Document = ???

// Description of the filter, computed once at construction time.
override val toString = s"KenLMPar($adjacent,$threshold)"
}

class KenLMEvaluator(sudachi: String, kenlm: String) {
private val dictionary: Dictionary = Sudachi.get(sudachi)
final protected val tokenizer = dictionary.create()
Expand All @@ -64,14 +54,30 @@ class KenLMEvaluator(sudachi: String, kenlm: String) {
ev.clear()
while (iter.hasNext && continue) {
val token = iter.next()
if (token.normalizedForm() != " ") {
if (acceptedToken(token)) {
val remaining = ev.append(token.surface())
continue = remaining > 0
}
}
ev
}

/**
  * Decides whether a morpheme is accepted or rejected.
  *
  * A morpheme is rejected when
  *  - its normalized form is exactly a single space, or
  *  - its surface is one character long and that character is
  *    `Paragraphs.HTML_LINK_START`, `Paragraphs.HTML_LINK_END` or a newline.
  *
  * Every other morpheme is accepted.
  *
  * @param x morpheme to inspect
  * @return `true` if the morpheme is accepted, `false` if it is rejected
  */
def acceptedToken(x: Morpheme): Boolean = {
  val isBlank = x.normalizedForm() == " "
  if (isBlank) {
    false
  } else {
    val surface = x.surface()
    val isSingleChar = surface.length == 1
    // Only single-character surfaces can be one of the rejected marker characters.
    !isSingleChar || (surface.charAt(0) match {
      case Paragraphs.HTML_LINK_START | Paragraphs.HTML_LINK_END | '\n' => false
      case _ => true
    })
  }
}

/**
  * Produces the score for a filled evaluator buffer.
  *
  * @param ev evaluator whose buffer has already been populated
  * @return the value reported by `ev.evaluate()`
  */
def extractScore(ev: BufferEvaluator): Double = {
  ev.evaluate()
}

def scoreParagraph(p: Paragraph): Double = {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package com.worksap.nlp.uzushio.lib.filters

import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph}
import com.worksap.nlp.uzushio.lib.filters.base.DocFilter

import scala.collection.mutable

/**
  * A paragraph paired with the score computed for it.
  *
  * @param p   the wrapped paragraph
  * @param ppx the score attached to the paragraph
  */
final case class ParagraphWithPerplexity(p: Paragraph, ppx: Float) {

  /** Whether the wrapped paragraph reports itself as alive. */
  def isAlive: Boolean = p.isAlive

  /**
    * Returns a new pair whose paragraph has its `remove` field set to `x`;
    * the score is carried over unchanged.
    */
  def remove(x: AnyRef): ParagraphWithPerplexity = {
    val marked = p.copy(remove = x)
    ParagraphWithPerplexity(marked, ppx)
  }
}

/**
  * Document filter that marks paragraphs sitting inside a window of low-scoring paragraphs.
  *
  * Every paragraph is scored with the KenLM evaluator. A paragraph that is still alive is marked
  * as removed by this filter when every paragraph in the window ending at it
  * (see [[shouldRemoveBack]]) or every paragraph in the window starting at it
  * (see [[shouldRemoveFwd]]) has a score that is not above the cutoff. Windows are truncated at
  * the document boundaries.
  *
  * NOTE(review): scores are presumably log10-scale language-model scores where lower means less
  * likely text — confirm against `KenLMEvaluator.scoreParagraph`.
  *
  * @param sudachi   passed to `KenLMEvaluator.make`
  * @param kenlm     passed to `KenLMEvaluator.make`
  * @param outliers  passed to `KenLMEvaluator.make`
  * @param count     window size, in paragraphs
  * @param threshold value from which the score cutoff is derived as `-log10(threshold)`
  */
class KenLMParagraphPerplexity(
    sudachi: String,
    kenlm: String,
    outliers: Float = 0.02f,
    count: Int = 3,
    threshold: Float = 1e6f
) extends DocFilter {
  // Score cutoff: the negated base-10 logarithm of `threshold`.
  private val lmScore = -Math.log10(threshold).toFloat

  // Created on first use and excluded from serialization.
  @transient
  private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)

  /**
    * Scores all paragraphs of the document and marks the ones selected by [[markParagraphs]].
    *
    * @return a copy of the document with the updated paragraphs if at least one paragraph was
    *         marked, otherwise the very same document instance
    */
  override def checkDocument(doc: Document): Document = {
    val evaluator = processor
    val scored = doc.paragraphs
      .map { par => ParagraphWithPerplexity(par, evaluator.scoreParagraph(par).toFloat) }
      .toBuffer

    val marked = markParagraphs(scored)

    if (marked > 0) doc.copy(paragraphs = scored.map(_.p)) else doc
  }

  /**
    * Walks the buffer front to back and marks, in place, each alive paragraph for which
    * [[shouldRemoveBack]] or [[shouldRemoveFwd]] holds. After marking a paragraph,
    * [[removePrev]] is applied to the paragraphs before it.
    *
    * @return total number of paragraphs marked during the walk
    */
  def markParagraphs(paragraphs: mutable.Buffer[ParagraphWithPerplexity]): Int = {
    val total = paragraphs.length
    var changed = 0
    for (i <- 0 until total) {
      val current = paragraphs(i)
      val selected = current.isAlive &&
        (shouldRemoveBack(paragraphs, i) || shouldRemoveFwd(paragraphs, i, total))
      if (selected) {
        paragraphs(i) = current.remove(this)
        changed += 1 + removePrev(paragraphs, i)
      }
    }
    changed
  }

  /**
    * Marks, in place, the alive paragraphs among the (up to) `count` paragraphs directly
    * before `offset` whose score is at or below the cutoff.
    *
    * @return number of paragraphs marked by this call
    */
  def removePrev(paragraphs: mutable.Buffer[ParagraphWithPerplexity], offset: Int): Int = {
    val lowest = math.max(offset - count, 0)
    var removed = 0
    // Walk backwards from the paragraph just before `offset` down to `lowest`.
    for (i <- (offset - 1) to lowest by -1) {
      val candidate = paragraphs(i)
      if (candidate.isAlive && candidate.ppx <= lmScore) {
        paragraphs(i) = candidate.remove(this)
        removed += 1
      }
    }
    removed
  }

  /**
    * Checks the window of (up to) `count` paragraphs that ends at `offset`, inclusive.
    *
    * @return `false` as soon as a paragraph with a score above the cutoff is found,
    *         `true` otherwise
    */
  def shouldRemoveBack(
      paragraphs: mutable.Buffer[ParagraphWithPerplexity],
      offset: Int
  ): Boolean = {
    val first = math.max(offset - count + 1, 0)
    // Scan from `offset` backwards; the negated `>` keeps the comparison exactly as a
    // "score is not above the cutoff" test.
    (offset to first by -1).forall(i => !(paragraphs(i).ppx > lmScore))
  }

  /**
    * Checks the window of (up to) `count` paragraphs that starts at `offset`, inclusive,
    * and never reaches past `length`.
    *
    * @return `false` as soon as a paragraph with a score above the cutoff is found,
    *         `true` otherwise
    */
  def shouldRemoveFwd(
      paragraphs: mutable.Buffer[ParagraphWithPerplexity],
      offset: Int,
      length: Int
  ): Boolean = {
    val limit = math.min(offset + count, length)
    (offset until limit).forall(i => !(paragraphs(i).ppx > lmScore))
  }

  // Description of the filter, computed once at construction time.
  override val toString: String = s"KenLMPar($outliers,$count,$threshold)"
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ trait HighLowDocFilter extends DocFilter { self =>
}

@transient object High {
override val toString = s"${self.getClass.getSimpleName}.High($high)"
override val toString = s"$describeFilter.High($high)"
}

override def toString = s"${self.getClass.getSimpleName}($low,$high)"
override def toString = s"$describeFilter($low,$high)"
}

trait HighLowDocIntFilter extends DocFilter { self =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -950,9 +950,8 @@ object DeduplicateParagraphs {
descr = "Spark StorageLevel for caching operations"
)
val textOnly = toggle(default = Some(false), descrYes = "output only text")
verify()

val replacements = props[String]('P', descr = "Properties to resolve in filter chains")
verify()

def toArgs: Args = Args(
inputs = input(),
Expand Down
7 changes: 7 additions & 0 deletions scripts/pipeline_test_perplexity.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Filter chain for the perplexity test pipeline.
// ${sudachi} and ${kenlm} are substitutions that have to be provided from outside this file
// (presumably through the "properties to resolve in filter chains" option — confirm).
filters: [
// Length filter with a lower bound of 50.
{"class": "DocLength", "low": 50},
// Three document-level perplexity filters that differ only in `high`: 1e7, 1e6, 1e5.
{"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e7 },
{"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e6 },
{"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e5 },
// Paragraph-level perplexity filter: window of 3 paragraphs, threshold 1e6.
{"class": "KenLMParagraphPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, count: 3, threshold: 1e6 },
]

0 comments on commit fed8a9c

Please sign in to comment.