Skip to content

Commit

Permalink
Use fast_langdetect to replace cld2
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Jun 17, 2024
1 parent 0606301 commit ce0d990
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 13 deletions.
17 changes: 6 additions & 11 deletions magic_pdf/libs/language.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pycld2 as cld2
import regex
import unicodedata

from fast_langdetect import detect_langs

RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")

Expand All @@ -13,17 +12,13 @@ def remove_bad_chars(text):
def detect_lang(text: str) -> str:
if len(text) == 0:
return ""

try:
_, _, details = cld2.detect(text)
lang_upper = detect_langs(text)
except:
# cld2 doesn't like control characters
# https://github.com/mikemccand/chromium-compact-language-detector/issues/22#issuecomment-435904616
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C',]])
_, _, details = cld2.detect(html_no_ctrl_chars)
lang = ""
html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
lang_upper = detect_langs(html_no_ctrl_chars)
try:
lang = details[0][1].lower()
lang = lang_upper.lower()
except:
lang = ""
return lang
Expand All @@ -33,4 +28,4 @@ def detect_lang(text: str) -> str:
print(detect_lang("This is a test."))
print(detect_lang("<html>This is a test</html>"))
print(detect_lang("这个是中文测试。"))
print(detect_lang("<html>这个是中文测试。</html>"))
print(detect_lang("<html>这个是中文测试。</html>"))
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@ loguru>=0.6.0
matplotlib>=3.8.3
numpy>=1.21.6
pandas>=1.3.5
pycld2>=0.41
fast-langdetect>=0.1.1
regex>=2023.12.25
termcolor>=2.4.0
wordninja>=2.0.0
scikit-learn>=1.0.2
nltk==3.8.1
s3pathlib>=2.1.1
pytest
paddlepaddle
paddleocr @ https://github.com/magicpdf/PaddleOCR/releases/download/paddleocr-2.8.2-released/paddleocr-2.8.2-py3-none-any.whl

0 comments on commit ce0d990

Please sign in to comment.