-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
40e844a
commit fed8a9c
Showing
5 changed files
with
136 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
106 changes: 106 additions & 0 deletions
106
lib/src/main/scala/com/worksap/nlp/uzushio/lib/filters/KenLMParagraphPerplexity.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package com.worksap.nlp.uzushio.lib.filters | ||
|
||
import com.worksap.nlp.uzushio.lib.cleaning.{Document, Paragraph} | ||
import com.worksap.nlp.uzushio.lib.filters.base.DocFilter | ||
|
||
import scala.collection.mutable | ||
|
||
/**
  * Pairs a paragraph with the score that was computed for it.
  *
  * @param p   the wrapped paragraph
  * @param ppx score attached to `p` (produced by the caller; this class only stores it)
  */
final case class ParagraphWithPerplexity(p: Paragraph, ppx: Float) {

  /** Liveness of the wrapped paragraph, exactly as the paragraph reports it. */
  def isAlive: Boolean = p.isAlive

  /**
    * Builds a new pair whose paragraph has its `remove` field set to `x`.
    * The stored score is carried over unchanged.
    *
    * @param x marker written into the paragraph's `remove` field
    */
  def remove(x: AnyRef): ParagraphWithPerplexity = {
    val marked = p.copy(remove = x)
    ParagraphWithPerplexity(marked, ppx)
  }
}
|
||
/**
  * Document filter that marks paragraphs lying in a run of consecutive paragraphs whose scores
  * are all not above `-log10(threshold)`.
  *
  * A live paragraph is marked (via `ParagraphWithPerplexity.remove(this)`) when every paragraph
  * in the window of up to `count` paragraphs ending at it, or in the window of up to `count`
  * paragraphs starting at it, has a score that is not above the limit. Windows are cut short at
  * the document boundaries. Whenever a paragraph is marked, up to `count` paragraphs directly
  * before it are marked as well if they are alive and score at or below the limit.
  *
  * @param sudachi   forwarded to `KenLMEvaluator.make`
  *                  (presumably a Sudachi dictionary location; confirm against the evaluator)
  * @param kenlm     forwarded to `KenLMEvaluator.make`
  *                  (presumably a KenLM model location; confirm against the evaluator)
  * @param outliers  forwarded to `KenLMEvaluator.make`
  * @param count     window size, in paragraphs, used by the backward/forward checks
  * @param threshold converted to the score limit `-log10(threshold)`
  */
class KenLMParagraphPerplexity(
    sudachi: String,
    kenlm: String,
    outliers: Float = 0.02f,
    count: Int = 3,
    threshold: Float = 1e6f
) extends DocFilter {
  // Score limit derived from the threshold; with the default 1e6 this is -6.
  // NOTE(review): comparing against -log10(threshold) suggests scoreParagraph yields a log10
  // probability-like value; confirm against KenLMEvaluator.
  private val lmScore = -Math.log10(threshold).toFloat

  // Created on first use and excluded from serialization.
  @transient
  private lazy val processor = KenLMEvaluator.make(sudachi, kenlm, outliers)

  /**
    * Scores every paragraph of `doc`, marks the ones selected by [[markParagraphs]] and returns
    * a copy of the document carrying the updated paragraphs. When nothing was marked the input
    * document instance itself is returned.
    */
  override def checkDocument(doc: Document): Document = {
    // Resolve the lazy evaluator once, before mapping over the paragraphs.
    val evaluator = processor
    val scored = doc.paragraphs
      .map(par => ParagraphWithPerplexity(par, evaluator.scoreParagraph(par).toFloat))
      .toBuffer

    if (markParagraphs(scored) > 0) doc.copy(paragraphs = scored.map(_.p)) else doc
  }

  /**
    * Walks the buffer front to back and marks, in place, every live paragraph for which the
    * backward or the forward window check succeeds, together with eligible preceding paragraphs.
    *
    * @param paragraphs scored paragraphs; modified in place
    * @return number of paragraphs that were marked during this call
    */
  def markParagraphs(paragraphs: mutable.Buffer[ParagraphWithPerplexity]): Int = {
    val total = paragraphs.length
    var marked = 0
    for (i <- 0 until total) {
      val current = paragraphs(i)
      val selected = current.isAlive &&
        (shouldRemoveBack(paragraphs, i) || shouldRemoveFwd(paragraphs, i, total))
      if (selected) {
        paragraphs(i) = current.remove(this)
        // The paragraph itself plus whatever gets marked right before it.
        marked += 1 + removePrev(paragraphs, i)
      }
    }
    marked
  }

  /**
    * Marks, in place, the live paragraphs among the up to `count` entries directly preceding
    * `offset` whose score is at or below the limit. Entries are visited from nearest to farthest.
    *
    * @param paragraphs scored paragraphs; modified in place
    * @param offset     index of the paragraph that has just been marked
    * @return number of preceding paragraphs marked by this call
    */
  def removePrev(paragraphs: mutable.Buffer[ParagraphWithPerplexity], offset: Int): Int = {
    val lowest = math.max(offset - count, 0)
    var removed = 0
    for (i <- (offset - 1) to lowest by -1) {
      val candidate = paragraphs(i)
      if (candidate.isAlive && candidate.ppx <= lmScore) {
        paragraphs(i) = candidate.remove(this)
        removed += 1
      }
    }
    removed
  }

  /**
    * Backward window check: true when no paragraph in the window of up to `count` entries
    * ending at `offset` (inclusive) has a score above the limit. The window is clipped at
    * index 0, and liveness of the paragraphs is not considered.
    */
  def shouldRemoveBack(
      paragraphs: mutable.Buffer[ParagraphWithPerplexity],
      offset: Int
  ): Boolean = {
    val first = math.max(offset - count + 1, 0)
    // Visit from `offset` downwards; a single score above the limit vetoes the window.
    !(offset to first by -1).exists(i => paragraphs(i).ppx > lmScore)
  }

  /**
    * Forward window check: true when no paragraph in the window of up to `count` entries
    * starting at `offset` (inclusive) has a score above the limit. The window is clipped at
    * `length`, and liveness of the paragraphs is not considered.
    */
  def shouldRemoveFwd(
      paragraphs: mutable.Buffer[ParagraphWithPerplexity],
      offset: Int,
      length: Int
  ): Boolean = {
    val limit = math.min(offset + count, length)
    // A single score above the limit vetoes the window.
    !(offset until limit).exists(i => paragraphs(i).ppx > lmScore)
  }

  // Computed once at construction from the tuning parameters.
  override val toString: String = s"KenLMPar($outliers,$count,$threshold)"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Filter list: each entry names a filter "class" followed by that filter's arguments.
# ${sudachi} and ${kenlm} are substitutions that are not defined in this snippet and must be
# provided from elsewhere.
# The three KenLMDocAvgPerplexity entries differ only in their `high` value (1e7, 1e6, 1e5).
filters: [ | ||
{"class": "DocLength", "low": 50}, | ||
{"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e7 }, | ||
{"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e6 }, | ||
{"class": "KenLMDocAvgPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, low: 0, high: 1e5 }, | ||
{"class": "KenLMParagraphPerplexity", sudachi: ${sudachi}, kenlm: ${kenlm}, outliers: 0.02, count: 3, threshold: 1e6 }, | ||
] |