Skip to content

Commit

Permalink
Allow import of text files in non utf-8 encoding (#299)
Browse files Browse the repository at this point in the history
  • Loading branch information
lemig authored Jan 31, 2025
1 parent 2b61ad1 commit 18fd54f
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 4 deletions.
4 changes: 3 additions & 1 deletion .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ BACKEND_PORT=8000
BACKEND_RELOAD=True

FRONTEND_HOST=0.0.0.0
FRONTEND_PORT=3000
FRONTEND_PORT=3000

TEXT_FILE_ENCODINGS=utf-8,latin1,cp1252,iso-8859-1
24 changes: 21 additions & 3 deletions server/app/routes/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,7 @@ async def convert_documents(
# Process all files
for filename, file_path, is_txt in zip(original_filenames, file_paths, txt_files):
if is_txt:
# For txt files, just read the content
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
content = try_read_file_with_encodings(file_path)
results.append({
"filename": filename,
"markdown": content
Expand Down Expand Up @@ -243,4 +241,24 @@ async def azure_convert_documents(

return {"documents": formatted_results}

def get_supported_encodings():
    """Return the ordered list of text encodings to try when reading files.

    The list is read from the ``TEXT_FILE_ENCODINGS`` environment variable,
    a comma-separated string such as ``"utf-8,cp1252,latin1"``. Whitespace
    around each name is stripped and blank entries (caused by stray or
    trailing commas, or an empty variable) are dropped, because an empty
    encoding name makes ``open()`` raise ``LookupError``.

    Returns:
        A non-empty list of encoding names. Falls back to ``["utf-8"]`` when
        the variable is unset or contains no usable entry.
    """
    encodings_str = os.getenv("TEXT_FILE_ENCODINGS", "utf-8")
    candidates = (enc.strip() for enc in encodings_str.split(","))
    encodings = [enc for enc in candidates if enc]
    # Never hand callers an empty list: keep the documented default instead.
    return encodings or ["utf-8"]

def try_read_file_with_encodings(file_path: str) -> str:
    """Read a text file, trying each configured encoding in order.

    The candidate encodings come from ``get_supported_encodings()``. The
    first encoding that decodes the whole file without error wins. If none
    succeeds, the file is decoded as ``latin1``, which maps every possible
    byte to a character and therefore cannot fail.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read (not caught here).
    """
    for encoding in get_supported_encodings():
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            print(f"Failed to decode {file_path} with encoding {encoding}")
        except LookupError:
            # A misspelt or unsupported encoding name in the configuration
            # must not abort the request; skip it and try the next one.
            print(f"Skipping unknown encoding {encoding!r} for {file_path}")

    # Last resort: latin1 accepts any byte sequence, so this always succeeds.
    with open(file_path, 'r', encoding='latin1', errors='replace') as f:
        return f.read()

0 comments on commit 18fd54f

Please sign in to comment.