From dc5b27331a09a97a2e60ed5233a837cf93aefb76 Mon Sep 17 00:00:00 2001
From: Marsman1996
Date: Wed, 8 Jan 2025 19:05:14 +0800
Subject: [PATCH] community: add init for `UnstructuredHTMLLoader` to solve pathlib paths

---
 .../document_loaders/html.py | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/libs/community/langchain_community/document_loaders/html.py b/libs/community/langchain_community/document_loaders/html.py
index 857142bce96ae..9ea781a7e91cb 100644
--- a/libs/community/langchain_community/document_loaders/html.py
+++ b/libs/community/langchain_community/document_loaders/html.py
@@ -1,4 +1,5 @@
-from typing import List
+from pathlib import Path
+from typing import Any, List, Union
 
 from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
 
@@ -27,6 +28,23 @@ class UnstructuredHTMLLoader(UnstructuredFileLoader):
     https://unstructured-io.github.io/unstructured/bricks.html#partition-html
     """
 
+    def __init__(
+        self,
+        file_path: Union[str, Path],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
+    ):
+        """
+
+        Args:
+            file_path: The path to the HTML file to load.
+            mode: The mode to use when loading the file. Can be one of "single",
+                "multi", or "all". Default is "single".
+            **unstructured_kwargs: Any kwargs to pass to the unstructured.
+        """
+        file_path = str(file_path)
+        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
+
     def _get_elements(self) -> List:
         from unstructured.partition.html import partition_html
 