Develop DeduplicateDocuments filter #25

Merged

Commits (23)
cc5d11f  develop DeduplicateDocuments filter (otakumesi, Sep 21, 2023)
ef71563  Refactoring DedupicateDocuments for test (otakumesi, Sep 21, 2023)
3fe58d2  Update logic computing duplication ratio (otakumesi, Sep 22, 2023)
b91ffb5  Apply scalafmt (otakumesi, Sep 22, 2023)
b2f8743  Merge remote-tracking branch 'origin/main' into dev-duplicate-filters (otakumesi, Sep 22, 2023)
4500c66  modify oneline implementation (otakumesi, Sep 23, 2023)
283c2f0  modify typo and operators (otakumesi, Sep 23, 2023)
9f7a793  add filtering document logic in processDocumentParts (otakumesi, Sep 23, 2023)
c96e5d6  fix operator and variable (otakumesi, Sep 25, 2023)
e967f37  sniff encodings more leniently (eiennohito, Sep 22, 2023)
d491e10  modify oneline implementation (otakumesi, Sep 23, 2023)
e4d21b2  modify typo and operators (otakumesi, Sep 23, 2023)
511cb5d  add filtering document logic in processDocumentParts (otakumesi, Sep 23, 2023)
9d5c2f5  fix operator and variable (otakumesi, Sep 25, 2023)
1bc277c  add debug tool for displaying metrics with dedup stats (eiennohito, Sep 25, 2023)
6bc1e71  add Gaussian Random Value Generator, and refactoring RandomGeneratorF… (otakumesi, Sep 25, 2023)
a093809  Merge branch 'main' of ssh://github.com/WorksApplications/uzushio i… (otakumesi, Sep 25, 2023)
02e68d3  Merge branch 'dev-duplicate-filters' of github.com:asahi-research/uzu… (otakumesi, Sep 25, 2023)
87d037b  add tests for DeduplicateDocuments (otakumesi, Sep 26, 2023)
9316695  add duplication-score and apply scalafmt (otakumesi, Sep 26, 2023)
f52a1f6  put back codes (otakumesi, Sep 26, 2023)
23b2e7c  modify DeduplicationFilter parameters (otakumesi, Sep 27, 2023)
2652ac4  remove println (otakumesi, Sep 27, 2023)
Pipeline filter configuration:

@@ -1,3 +1,4 @@
 filters: [
+  {"class": "DuplicateDocuments"},
   {"class": "DuplicateParagraphs", "limit": 2}
 ]
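The hunk adds the new document-level deduplication filter to the pipeline configuration, ahead of the existing paragraph-level DuplicateParagraphs entry.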
Filter pipeline (Pipeline class):

@@ -88,13 +88,13 @@ class PerParagraphFilter(val filter: ParagraphFilter) extends DocFilter {
     .copy(paragraphs = doc.paragraphs.map(filter.checkParagraph))
 }
 
-final class Pipeline(filers: Array[DocFilter]) extends Serializable {
+final class Pipeline(filters: Array[DocFilter]) extends Serializable {
   def applyFilters(doc: Document): Document = {
     var i = 0
-    val len = filers.length
+    val len = filters.length
     var state = doc
-    while (i < len && state.remove != null) {
-      val f = filers(i)
+    while (i < len && state.remove == null) {
+      val f = filters(i)
       state = f.checkDocument(state)
       i += 1
     }
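Besides renaming filers to filters, the loop-condition change is the functional fix: documents start with remove == null, so under the old condition state.remove != null the loop body never executed and no filter was ever applied. To make the corrected short-circuit semantics concrete, here is a minimal standalone Scala sketch; Doc, DocFilter, and Pipeline below are simplified stand-ins for illustration, not the library's actual definitions:

// Standalone sketch with simplified stand-in types.
case class Doc(text: String, remove: AnyRef = null)

trait DocFilter {
  def checkDocument(d: Doc): Doc
}

final class Pipeline(filters: Array[DocFilter]) {
  def applyFilters(doc: Doc): Doc = {
    var i = 0
    var state = doc
    // Run filters only while the document is still alive (remove == null);
    // the first filter that marks it for removal short-circuits the rest.
    while (i < filters.length && state.remove == null) {
      state = filters(i).checkDocument(state)
      i += 1
    }
    state
  }
}

object PipelineDemo extends App {
  val dropAll: DocFilter = new DocFilter {
    def checkDocument(d: Doc): Doc = d.copy(remove = this)
  }
  val neverReached: DocFilter = new DocFilter {
    def checkDocument(d: Doc): Doc = { println("should not run"); d }
  }
  val out = new Pipeline(Array(dropAll, neverReached)).applyFilters(Doc("hello"))
  println(out.remove != null) // prints true; the second filter was skipped
}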
New file: DeduplicateDocuments filter:

@@ -0,0 +1,69 @@
package com.worksap.nlp.uzushio.lib.filters

import com.worksap.nlp.uzushio.lib.stats.NgramHashExtractor
import com.worksap.nlp.uzushio.lib.cleaning.Document
import com.worksap.nlp.uzushio.lib.filters.base.DocFilter
import com.worksap.nlp.uzushio.lib.utils.MathUtil
import scala.math._
import scala.util.Random

trait RandomGeneratorFromStringBase {
  def generateRandom(docId: String): Double
}

// Objects passed as DocFilter arguments on Spark need to mix in Serializable.
object RandomGeneratorFromString extends RandomGeneratorFromStringBase with Serializable {
  def generateRandom(docId: String): Double = {
    val seed = NgramHashExtractor.hashString(docId)
    MathUtil.asRandomDouble(seed)
  }
}

class GaussianRandomGeneratorFromString(
    val mu: Double = 0.3,
    val sd: Double = 0.1
) extends RandomGeneratorFromStringBase
    with Serializable {
  def generateRandom(docId: String): Double = {
    val seed = NgramHashExtractor.hashString(docId)
    // Seed a local Random instance; reseeding the shared global Random is not
    // thread-safe on Spark executors.
    val rng = new Random(seed)
    // Scale by the standard deviation and shift by the mean to get N(mu, sd^2).
    rng.nextGaussian() * sd + mu
  }
}

class DeduplicateDocuments(
    val baseNumFreq: Int = 10,
    val randomGenerator: RandomGeneratorFromStringBase = new GaussianRandomGeneratorFromString
) extends DocFilter {

  def computeNearDuplicateTextRatio(doc: Document): Float = {
Review comment (Collaborator):
I think the metric is good!

    val iter = doc.aliveParagraphs

    var totalLengthWeightedNearFreq = 0.0
    var totalLength = 0.0

    while (iter.hasNext) {
      val paragraph = iter.next()
      val text = paragraph.text
      val textLength = text.length()
      val nearFreq = if (paragraph.nearFreq < baseNumFreq) paragraph.nearFreq else baseNumFreq
      val weight = log(nearFreq) / log(baseNumFreq)

      totalLength += textLength
      totalLengthWeightedNearFreq += (textLength * weight)
    }

    MathUtil.ratio(totalLengthWeightedNearFreq.toFloat, totalLength.toFloat)
  }

  def shouldRemoveDocument(doc: Document) = {
    val nearDuplicateTextRatio = computeNearDuplicateTextRatio(doc)
    val thresholdProb = randomGenerator.generateRandom(doc.render())

    nearDuplicateTextRatio >= thresholdProb
Review thread:

Collaborator:
I don't think this condition is correct. It should not assign the same removal probability to a document 90% of which appeared at least once in the corpus and to one 90% of which appeared 100 or more times.

Contributor Author (@otakumesi, Sep 21, 2023):
@eiennohito I see. Indeed, the current implementation treats any duplication count of 2 or more the same and simply computes the proportion of duplicated text. In that case, would it be better to, for example, compute a weighted ratio from the nearFreq values, or to tune the threshold using something like the average nearFreq?

Contributor Author (@otakumesi, Sep 21, 2023):
This is pseudocode, but how about an implementation along these lines? The idea is that the larger nearFreq is, the larger the ratio (the duplication metric) becomes.

weights = ln(nearFreqList) / (ln(nearFreqList) + 1)
# or
# nearFreqListCut100 = [nearFreq if nearFreq < 100 else 100 for nearFreq in nearFreqList]
# weights = ln(nearFreqListCut100) / ln(100)
ratio = weightedSum(paragraphLengths, weights) / documentLength

Contributor Author:
That said, it might be more natural to implement this so that thresholdProb becomes smaller as the average freq grows. In that case we could, say, feed the average freq into the distribution parameters of the random threshold.

Contributor Author:
https://github.com/WorksApplications/uzushio/pull/25/files#diff-b97c0389f5b19a25a0e0b320169e9d0e137b9e24fc420835827209d4e2ceda7bR24-R47
I have applied this. The metric approaches 1 the more paragraphs with a large freq the document contains.

Collaborator:
In the end, how many duplicate documents are likely to remain?

Contributor Author (@otakumesi, Sep 25, 2023):
@eiennohito I did a quick check, and the results looked like this:

  • (before filtering) -> (after filtering)
  • 244 -> 228
  • 276 -> 266
  • 267 -> 229
  • 239 -> 228

Not much is being filtered out; it looks like the random threshold needs tuning.

Contributor Author:
Filtering on this metric with Gaussian random thresholds (mu=0.1, sd=0.1) gave:

  • 244 -> 137
  • 276 -> 147
  • 267 -> 139
  • 239 -> 124

Roughly half were removed. I will see whether the filtered-out documents can be extracted for inspection.

  }

  override def checkDocument(doc: Document): Document = {
    doc.removeWhen(shouldRemoveDocument(doc), this)
  }
}
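Written out as a formula (a restatement of computeNearDuplicateTextRatio above, with N = baseNumFreq and |p| the character length of paragraph p):

$$\mathrm{ratio}(d) = \frac{\sum_{p \in \mathrm{alive}(d)} |p| \cdot \log\bigl(\min(\mathrm{nearFreq}_p,\, N)\bigr) / \log N}{\sum_{p \in \mathrm{alive}(d)} |p|}$$

A paragraph seen once in the corpus contributes weight 0 and a paragraph seen N or more times contributes weight 1; the document is removed when this ratio reaches the per-document threshold drawn from the Gaussian generator seeded with a hash of the rendered document.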
DedupFilterStatistics runner:

@@ -2,6 +2,7 @@ package com.worksap.nlp.uzushio.lib.runners
 
 import com.worksap.nlp.uzushio.lib.cleaning.Document
 import com.worksap.nlp.uzushio.lib.utils.Resources.AutoClosableResource
+import com.worksap.nlp.uzushio.lib.filters.DeduplicateDocuments
 import org.apache.spark.sql.expressions.UserDefinedFunction
 import org.apache.spark.sql.functions.{collect_list, octet_length, udf}
 import org.apache.spark.sql.{SaveMode, SparkSession}
@@ -42,7 +43,7 @@ object DedupFilterStatistics {
 
     val withValues = assembledDocs.select(
       metric($"docId", $"text", $"pos", $"exactFreq", $"nearFreq") as "res"
-    ).select(// make columns in the same order as FilterStatistics
+    ).select( // make columns in the same order as FilterStatistics
       $"res._2" as "value",
       $"res._1" as "text"
     )
@@ -52,6 +53,11 @@
       .csv(args.output())
   }
 
+  def computeDuplicationScore(doc: Document, baseNumFreq: Int = 10) = {
+    val filter = new DeduplicateDocuments(baseNumFreq)
+    filter.computeNearDuplicateTextRatio(doc)
+  }
+
   def ratioUdfConstructor[T: TypeTag](sample: Double)(extractor: Document => Float): UserDefinedFunction = {
     udf {
       (
@@ -86,6 +92,8 @@ object DedupFilterStatistics {
       udfMaker(doc => doc.aliveParagraphs.map(_.nearFreq).foldRight(0)(_.max(_)))
     case "min-near-freq" =>
       udfMaker(doc => doc.aliveParagraphs.map(_.nearFreq).foldRight(0)(_.min(_)))
+    case "duplication-score" =>
+      udfMaker(doc => computeDuplicationScore(doc))
     case _ => throw new IllegalArgumentException(s"unknown metric $ftype")
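The new duplication-score metric exposes, per document, exactly the ratio that DeduplicateDocuments thresholds on, so the dedup statistics tool can be used to inspect its distribution.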
DeduplicateParagraphs runner:

@@ -699,7 +699,10 @@ object DeduplicateParagraphs {
   ): String = {
     val doc = Document(parts)
     val filtered = args.pipeline.applyFilters(doc)
-    filtered.copy(paragraphs = doc.paragraphs.filter(_.remove != null)).render()
+    if (filtered.remove != null) {
+      return filtered.copy(IndexedSeq()).render()
+    }
+    filtered.copy(paragraphs = filtered.paragraphs.filter(_.remove == null)).render()
   }
 
   // noinspection TypeAnnotation,ScalaWeakerAccess
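The replaced line had two bugs: it took paragraphs from the pre-pipeline doc instead of from filtered, and its inverted predicate kept exactly the paragraphs that were marked for removal. The new code first handles documents removed as a whole by rendering them with an empty paragraph list, and otherwise keeps only the paragraphs that survived filtering.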
MathUtil:

@@ -95,6 +95,13 @@ public static int matchingBits(long x, long y) {
         return Long.bitCount(~(x ^ y));
     }
 
+    public static float ratio(float sum, float total) {
+        if (sum == 0 || total == 0) {
+            return 0.0f;
+        }
+        return sum / total;
+    }
+
     public static float ratio(int count, int total) {
         if (count == 0 || total == 0) {
             return 0.0f;
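The float overload mirrors the existing int overload of ratio so that computeNearDuplicateTextRatio can pass its fractional, length-weighted sums.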
New file: DeduplicateDocumentsSpec test:

@@ -0,0 +1,64 @@
package com.worksap.nlp.uzushio.lib.filters

import com.worksap.nlp.uzushio.lib.cleaning.Document
import org.scalatest.freespec.AnyFreeSpec

// Test double that always returns a fixed "random" threshold.
class FixedProbRandomGenerator(
    val returnProb: Double = 0.5
) extends RandomGeneratorFromStringBase {
  def generateRandom(docId: String): Double = returnProb
}

class DeduplicateDocumentsSpec extends AnyFreeSpec {
  def generateFilter(returnProb: Double): DeduplicateDocuments = {
    val randomGenerator = new FixedProbRandomGenerator(returnProb)
    new DeduplicateDocuments(100, randomGenerator)
  }

"DeduplicateDocumentsSpec" - {
val filter = generateFilter(0.5)

"computes correct ratio for non-deuplicated documents" in {
val paragraphs = testParagraphs(
Seq("test", "test", "test", "test"),
Seq(1, 1, 1, 1)
)
val doc = Document(paragraphs, "test")
assert(0.0f == filter.computeNearDuplicateTextRatio(doc))
assert(false == filter.shouldRemoveDocument(doc))
}

"computes correct ratio for non-deuplicated documents (boundary)" in {
val paragraphs = testParagraphs(
Seq("test", "test", "test", "test"),
Seq(1, 1, 99, 100)
)
val doc = Document(paragraphs, "test")
assert(0.5f > filter.computeNearDuplicateTextRatio(doc))
assert(false == filter.shouldRemoveDocument(doc))
}

"computes correct ratio for deuplicated documents" in {
val paragraphs = testParagraphs(
Seq("test", "test", "test", "test"),
Seq(100, 100, 100, 100)
)
val doc = Document(paragraphs, "test")
assert(1.0f == filter.computeNearDuplicateTextRatio(doc))
assert(true == filter.shouldRemoveDocument(doc))
}

"computes correct ratio for deuplicated documents (boundary)" in {
val paragraphs = testParagraphs(
Seq("test", "test", "test", "test"),
Seq(1, 1, 100, 100)
)
val doc = Document(paragraphs, "test")
assert(0.5f == filter.computeNearDuplicateTextRatio(doc))
assert(true == filter.shouldRemoveDocument(doc))
}
}
}
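A quick cross-check of the boundary cases: with baseNumFreq = 100, each paragraph's weight is log(min(nearFreq, 100)) / log(100). For nearFreqs (1, 1, 100, 100) the weights are (0, 0, 1, 1), so four equal-length paragraphs give a ratio of (0 + 0 + 1 + 1) / 4 = 0.5; that equals the fixed 0.5 threshold, and shouldRemoveDocument tests with >=, so the document is removed. For (1, 1, 99, 100) the third weight is log(99) / log(100) ≈ 0.998, the ratio lands just under 0.5, and the document is kept.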
Test helpers (package object filters):

@@ -31,4 +31,11 @@ package object filters {
       }.toIndexedSeq
     )
   }
+
+  def testParagraphs(texts: Seq[String], nearFreqs: Seq[Int]): IndexedSeq[Paragraph] = {
+    (texts, nearFreqs)
+      .zipped
+      .map((text, freq) => Paragraph("", text, 0, 1, freq))
+      .toIndexedSeq
+  }
 }
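testParagraphs builds minimal Paragraph fixtures whose only meaningful values are the text and the near-duplicate frequency, which is all the DeduplicateDocumentsSpec assertions above depend on.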