Skip to content

Commit

Permalink
fix paragraphs split logic
Browse files Browse the repository at this point in the history
  • Loading branch information
eiennohito committed Dec 23, 2023
1 parent 657e211 commit c1a765d
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ object Paragraphs {
final val FULLWIDTH_SPACE: Char = ' '

def extractCleanParagraphs(text: String): Seq[String] = {
val paragraphs = StringUtils.split(text, PARAGRAPH_SEP)
val paragraphs = StringUtils.splitByWholeSeparator(text, PARAGRAPH_SEP)
paragraphs.flatMap { x =>
val par = extractCleanParagraph(x)
if (hasContent(par)) Some(par) else None
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package com.worksap.nlp.uzushio.lib.utils

import org.scalatest.freespec.AnyFreeSpec

/**
 * Spec for [[Paragraphs.extractCleanParagraphs]].
 *
 * Pins the expected split of a small document into its paragraphs.
 */
class ParagraphsSpec extends AnyFreeSpec {
  "correctly splits paragraphs" in {
    // The input has a double newline between test1 and test2 and a single
    // newline between test2 and test3; only the double newline is expected
    // to produce a paragraph boundary.
    val input = "test1\n\ntest2\ntest3"
    val expected = Seq("test1", "test2\ntest3")

    val actual = Paragraphs.extractCleanParagraphs(input)

    // Check the count first so a wrong split is reported before the content comparison.
    assert(actual.length == 2)
    assert(actual == expected)
  }
}

0 comments on commit c1a765d

Please sign in to comment.