Develop DeduplicateDocuments filter #25
@@ -1,3 +1,4 @@
filters: [
  {"class": "DuplicateDocuments"},
  {"class": "DuplicateParagraphs", "limit": 2}
]
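For reference, assuming the config loader maps each "class" entry to a filter class and the remaining keys to constructor parameters (as the "limit" key for DuplicateParagraphs suggests), constructing the new filter added in this PR directly in Scala with its default settings would look roughly like the sketch below. The object name FilterConstructionSketch is invented for this example; the parameter values are the defaults from the filter implementation later in this diff.

import com.worksap.nlp.uzushio.lib.filters.{DeduplicateDocuments, GaussianRandomGeneratorFromString}

object FilterConstructionSketch {
  // Direct construction with the defaults from the class definition below:
  // baseNumFreq = 10 and a Gaussian threshold with mu = 0.3, sd = 0.1.
  val deduplicateDocuments = new DeduplicateDocuments(
    baseNumFreq = 10,
    randomGenerator = new GaussianRandomGeneratorFromString(mu = 0.3, sd = 0.1)
  )
}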
@@ -0,0 +1,69 @@
package com.worksap.nlp.uzushio.lib.filters

import com.worksap.nlp.uzushio.lib.stats.NgramHashExtractor
import com.worksap.nlp.uzushio.lib.cleaning.Document
import com.worksap.nlp.uzushio.lib.filters.base.DocFilter
import com.worksap.nlp.uzushio.lib.utils.MathUtil
import scala.math._
import scala.util.Random

trait RandomGeneratorFromStringBase {
  def generateRandom(docId: String): Double
}

// An object passed as an argument of a DocFilter on Spark needs to mix in Serializable.
object RandomGeneratorFromString extends RandomGeneratorFromStringBase with Serializable {
  def generateRandom(docId: String): Double = {
    val seed = NgramHashExtractor.hashString(docId)
    MathUtil.asRandomDouble(seed)
  }
}

class GaussianRandomGeneratorFromString(
    val mu: Double = 0.3,
    val sd: Double = 0.1
) extends RandomGeneratorFromStringBase
    with Serializable {
  def generateRandom(docId: String): Double = {
    val seed = NgramHashExtractor.hashString(docId)
    // Seed a dedicated generator so the result is deterministic per docId
    // without mutating the shared global Random state.
    val rng = new Random(seed)
    // Scale by the standard deviation and shift by the mean.
    rng.nextGaussian() * sd + mu
  }
}

class DeduplicateDocuments(
    val baseNumFreq: Int = 10,
    val randomGenerator: RandomGeneratorFromStringBase = new GaussianRandomGeneratorFromString
) extends DocFilter {

  // Length-weighted share of near-duplicate text in the document: each alive
  // paragraph contributes its length times log(min(nearFreq, baseNumFreq)) / log(baseNumFreq),
  // so unique paragraphs (nearFreq = 1) count as 0 and paragraphs with
  // nearFreq >= baseNumFreq count fully.
  def computeNearDuplicateTextRatio(doc: Document): Float = {
    val iter = doc.aliveParagraphs

    var totalLengthWeightedNearFreq = 0.0
    var totalLength = 0.0

    while (iter.hasNext) {
      val paragraph = iter.next()
      val text = paragraph.text
      val textLength = text.length()
      val nearFreq = if (paragraph.nearFreq < baseNumFreq) paragraph.nearFreq else baseNumFreq
      val weight = log(nearFreq) / log(baseNumFreq)

      totalLength += textLength
      totalLengthWeightedNearFreq += (textLength * weight)
    }

    MathUtil.ratio(totalLengthWeightedNearFreq.toFloat, totalLength.toFloat)
  }

  // Remove the document when its near-duplicate ratio meets or exceeds a
  // per-document random threshold.
  def shouldRemoveDocument(doc: Document) = {
    val nearDuplicateTextRatio = computeNearDuplicateTextRatio(doc)
    val thresholdProb = randomGenerator.generateRandom(doc.render())

    nearDuplicateTextRatio >= thresholdProb
  }

  override def checkDocument(doc: Document): Document = {
    doc.removeWhen(shouldRemoveDocument(doc), this)
  }
}

Review thread on the line "nearDuplicateTextRatio >= thresholdProb":

- I don't think this condition is correct. It is not right to give the same probability to a document where 90% of the text has appeared in the corpus at least once and to one where it has appeared 100 or more times.
- @eiennohito I see, you are right: the previous version treated everything duplicated two or more times the same and only computed the ratio of that duplicated text.
- This is pseudocode, but how about an implementation along these lines?
- That said, it might be more natural to implement it so that thresholdProb gets smaller as the average Freq gets larger.
- It is a metric that approaches 1 the more high-Freq paragraphs a document has.
- In the end, roughly how many duplicated documents are likely to remain?
- @eiennohito I did a quick check, and it came out like this. Not that much is being filtered out.
- When I filtered on this metric with Gaussian random values with mu=0.1 and sd=0.1, it went down by roughly half.
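As a rough, standalone illustration of the arithmetic discussed in the thread above, the sketch below mirrors the weighting used by computeNearDuplicateTextRatio and estimates how likely a toy document is to be removed under Gaussian thresholds with the mu and sd values mentioned in the thread. It has no dependency on the library types; the names DedupSketch, toyRatio, and removalProbability, and the sample paragraph lengths and frequencies, are invented for this example.

import scala.math.log
import scala.util.Random

object DedupSketch {
  // Mirrors the weighting in computeNearDuplicateTextRatio: each paragraph
  // contributes its length times log(min(nearFreq, base)) / log(base).
  def toyRatio(lengthsAndFreqs: Seq[(Int, Int)], base: Int = 10): Double = {
    val weighted = lengthsAndFreqs.map { case (len, freq) =>
      len * (log(math.min(freq, base)) / log(base))
    }.sum
    weighted / lengthsAndFreqs.map(_._1).sum
  }

  // Estimated probability that a document with the given ratio is removed,
  // i.e. the fraction of Gaussian thresholds N(mu, sd) it meets or exceeds.
  def removalProbability(ratio: Double, mu: Double, sd: Double, trials: Int = 100000): Double = {
    val rng = new Random(42)
    (1 to trials).count(_ => ratio >= rng.nextGaussian() * sd + mu).toDouble / trials
  }

  def main(args: Array[String]): Unit = {
    // Four paragraphs of 100 characters: three unique (nearFreq = 1), one seen
    // ten or more times in the corpus.
    val doc = Seq((100, 1), (100, 1), (100, 1), (100, 10))
    val ratio = toyRatio(doc) // 0.25 with base = 10
    println(f"near-duplicate ratio: $ratio%.2f")
    // Lowering mu makes removal much more likely for the same ratio:
    println(f"removal probability, mu=0.3 sd=0.1: ${removalProbability(ratio, 0.3, 0.1)}%.2f") // about 0.31
    println(f"removal probability, mu=0.1 sd=0.1: ${removalProbability(ratio, 0.1, 0.1)}%.2f") // about 0.93
  }
}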
@@ -0,0 +1,64 @@
package com.worksap.nlp.uzushio.lib.filters

import com.worksap.nlp.uzushio.lib.cleaning.Document
import org.scalatest.freespec.AnyFreeSpec

// Deterministic stand-in for the Gaussian generator: always returns the same threshold.
class FixedProbRandomGenerator(
    val returnProb: Double = 0.5
) extends RandomGeneratorFromStringBase {
  def generateRandom(docId: String): Double = returnProb
}

class DeduplicateDocumentsSpec extends AnyFreeSpec {
  def generateFilter(returnProb: Double): DeduplicateDocuments = {
    val randomGenerator = new FixedProbRandomGenerator(returnProb)
    new DeduplicateDocuments(100, randomGenerator)
  }

  "DeduplicateDocumentsSpec" - {
    val filter = generateFilter(0.5)

    "computes correct ratio for non-deduplicated documents" in {
      // testParagraphs is a shared test helper (not part of this diff) that builds
      // paragraphs from the given texts and nearFreq values.
      val paragraphs = testParagraphs(
        Seq("test", "test", "test", "test"),
        Seq(1, 1, 1, 1)
      )
      val doc = Document(paragraphs, "test")
      assert(0.0f == filter.computeNearDuplicateTextRatio(doc))
      assert(false == filter.shouldRemoveDocument(doc))
    }

    "computes correct ratio for non-deduplicated documents (boundary)" in {
      val paragraphs = testParagraphs(
        Seq("test", "test", "test", "test"),
        Seq(1, 1, 99, 100)
      )
      val doc = Document(paragraphs, "test")
      assert(0.5f > filter.computeNearDuplicateTextRatio(doc))
      assert(false == filter.shouldRemoveDocument(doc))
    }

    "computes correct ratio for deduplicated documents" in {
      val paragraphs = testParagraphs(
        Seq("test", "test", "test", "test"),
        Seq(100, 100, 100, 100)
      )
      val doc = Document(paragraphs, "test")
      assert(1.0f == filter.computeNearDuplicateTextRatio(doc))
      assert(true == filter.shouldRemoveDocument(doc))
    }

    "computes correct ratio for deduplicated documents (boundary)" in {
      val paragraphs = testParagraphs(
        Seq("test", "test", "test", "test"),
        Seq(1, 1, 100, 100)
      )
      val doc = Document(paragraphs, "test")
      assert(0.5f == filter.computeNearDuplicateTextRatio(doc))
      assert(true == filter.shouldRemoveDocument(doc))
    }
  }
}
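The boundary tests above rely on a simple property: with equal-length paragraphs the length weights cancel, so the ratio reduces to the mean of log(min(nearFreq, baseNumFreq)) / log(baseNumFreq). With baseNumFreq = 100, frequencies (1, 1, 99, 100) therefore land just under 0.5 while (1, 1, 100, 100) hit exactly 0.5. A minimal standalone check of that arithmetic follows; the object and method names are invented for this sketch.

import scala.math.log

object BoundaryRatioCheck {
  // With equal-length paragraphs, the ratio in computeNearDuplicateTextRatio
  // reduces to the mean of log(min(nearFreq, base)) / log(base).
  def boundaryRatio(freqs: Seq[Int], base: Int = 100): Double =
    freqs.map(f => log(math.min(f, base)) / log(base)).sum / freqs.size

  def main(args: Array[String]): Unit = {
    println(boundaryRatio(Seq(1, 1, 1, 1)))         // 0.0 -> below the 0.5 threshold, kept
    println(boundaryRatio(Seq(1, 1, 99, 100)))      // about 0.499, just under 0.5 -> kept
    println(boundaryRatio(Seq(1, 1, 100, 100)))     // exactly 0.5 -> meets the threshold, removed
    println(boundaryRatio(Seq(100, 100, 100, 100))) // 1.0 -> removed
  }
}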
Review comment: I think the metric is good!