diff --git a/hugegraph-llm/requirements.txt b/hugegraph-llm/requirements.txt index e10cb22f..755f5bea 100644 --- a/hugegraph-llm/requirements.txt +++ b/hugegraph-llm/requirements.txt @@ -4,7 +4,7 @@ qianfan~=0.3.18 retry~=0.9.2 tiktoken>=0.7.0 nltk~=3.8.1 -gradio~=4.43.0 +gradio~=4.44.1 jieba>=0.42.1 numpy~=1.24.4 python-docx~=1.1.2 diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py index 912b064f..89696b94 100644 --- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py +++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/app.py @@ -56,9 +56,9 @@ def authenticate(credentials: HTTPAuthorizationCredentials = Depends(sec)): def init_rag_ui() -> gr.Interface: with gr.Blocks( - theme="default", - title="HugeGraph RAG Platform", - css=CSS, + theme="default", + title="HugeGraph RAG Platform", + css=CSS, ) as hugegraph_llm_ui: gr.Markdown("# HugeGraph LLM RAG Demo") @@ -84,7 +84,6 @@ def init_rag_ui() -> gr.Interface: = else if settings.reranker_type == siliconflow [settings.reranker_api_key, "BAAI/bge-reranker-v2-m3", ""] = else ["","",""] """ - textbox_array_graph_config = create_configs_block() @@ -94,7 +93,6 @@ def init_rag_ui() -> gr.Interface: textbox_inp, textbox_answer_prompt_input = create_rag_block() with gr.Tab(label="3. Others Tools 🚧"): create_other_block() - def refresh_ui_config_prompt() -> tuple: settings.from_env() @@ -105,7 +103,6 @@ def refresh_ui_config_prompt() -> tuple: prompt.default_question, prompt.answer_prompt ) - hugegraph_llm_ui.load(fn=refresh_ui_config_prompt, outputs=[ textbox_array_graph_config[0], textbox_array_graph_config[1], diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py index 66c9b19f..3dd9eec3 100644 --- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py +++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/rag_block.py @@ -105,10 +105,10 @@ def create_rag_block(): ) with gr.Column(scale=1): with gr.Row(): - raw_radio = gr.Radio(choices=[True, False], value=True, label="Basic LLM Answer") + raw_radio = gr.Radio(choices=[True, False], value=False, label="Basic LLM Answer") vector_only_radio = gr.Radio(choices=[True, False], value=False, label="Vector-only Answer") with gr.Row(): - graph_only_radio = gr.Radio(choices=[True, False], value=False, label="Graph-only Answer") + graph_only_radio = gr.Radio(choices=[True, False], value=True, label="Graph-only Answer") graph_vector_radio = gr.Radio(choices=[True, False], value=False, label="Graph-Vector Answer") def toggle_slider(enable): @@ -265,4 +265,4 @@ def several_rag_answer( ) questions_file.change(read_file_to_excel, questions_file, [qa_dataframe, answer_max_line_count]) answer_max_line_count.change(change_showing_excel, answer_max_line_count, qa_dataframe) - return inp, answer_prompt_input \ No newline at end of file + return inp, answer_prompt_input diff --git a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py index 1c395285..3732a9f0 100644 --- a/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py +++ b/hugegraph-llm/src/hugegraph_llm/indices/vector_index.py @@ -97,8 +97,9 @@ def search(self, query_vector: List[float], top_k: int, dis_threshold: float = 0 for dist, i in zip(distances[0], indices[0]): if dist < dis_threshold: # Smaller distances indicate higher similarity results.append(deepcopy(self.properties[i])) + log.debug("[✓] Add valid distance %s to results.", dist) else: - log.debug("Distance %s is larger than threshold %s, ignore this result.", dist, dis_threshold) + log.debug("[x] Distance %s ≥ threshold %s, ignore this result.", dist, dis_threshold) return results @staticmethod diff --git a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py index b26d0943..8c2dd80f 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/document_op/chunk_split.py @@ -43,24 +43,22 @@ def __init__( def _get_separators(self, language: str) -> List[str]: if language == LANGUAGE_ZH: return ["\n\n", "\n", "。", ",", ""] - elif language == LANGUAGE_EN: + if language == LANGUAGE_EN: return ["\n\n", "\n", ".", ",", " ", ""] - else: - raise ValueError("language must be zh or en") + raise ValueError("language must be zh or en") def _get_text_splitter(self, split_type: str): if split_type == SPLIT_TYPE_DOCUMENT: return lambda text: [text] - elif split_type == SPLIT_TYPE_PARAGRAPH: + if split_type == SPLIT_TYPE_PARAGRAPH: return RecursiveCharacterTextSplitter( chunk_size=500, chunk_overlap=30, separators=self.separators ).split_text - elif split_type == SPLIT_TYPE_SENTENCE: + if split_type == SPLIT_TYPE_SENTENCE: return RecursiveCharacterTextSplitter( chunk_size=50, chunk_overlap=0, separators=self.separators ).split_text - else: - raise ValueError("Type must be paragraph, sentence, html or markdown") + raise ValueError("Type must be paragraph, sentence, html or markdown") def run(self, context: Optional[Dict[str, Any]]) -> Dict[str, Any]: all_chunks = [] diff --git a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py index fcc15530..9fc6cd7f 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/graph_rag_query.py @@ -248,7 +248,8 @@ def _process_edge(self, item: Any, flat_rel: str, prior_edge_str_len: int, raw_flat_rel: List[Any], i: int, use_id_to_match: bool) -> Tuple[str, int]: props_str = ", ".join(f"{k}: {v}" for k, v in item["props"].items()) props_str = f"{{{props_str}}}" if len(props_str) > 0 else "" - prev_matched_str = raw_flat_rel[i - 1]["id"] if use_id_to_match else raw_flat_rel[i - 1]["props"][self._prop_to_match] + prev_matched_str = raw_flat_rel[i - 1]["id"] if use_id_to_match else ( + raw_flat_rel)[i - 1]["props"][self._prop_to_match] if item["outV"] == prev_matched_str: edge_str = f" --[{item['label']}{props_str}]--> " diff --git a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py index 2df3a047..04804e61 100644 --- a/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py +++ b/hugegraph-llm/src/hugegraph_llm/operators/index_op/semantic_id_query.py @@ -20,11 +20,11 @@ from copy import deepcopy from typing import Dict, Any, Literal, List, Tuple -from hugegraph_llm.utils.log import log -from pyhugegraph.client import PyHugeClient from hugegraph_llm.config import resource_path, settings from hugegraph_llm.indices.vector_index import VectorIndex from hugegraph_llm.models.embeddings.base import BaseEmbedding +from hugegraph_llm.utils.log import log +from pyhugegraph.client import PyHugeClient class SemanticIdQuery: diff --git a/hugegraph-llm/src/hugegraph_llm/utils/log.py b/hugegraph-llm/src/hugegraph_llm/utils/log.py index 0de5bd96..0a186737 100755 --- a/hugegraph-llm/src/hugegraph_llm/utils/log.py +++ b/hugegraph-llm/src/hugegraph_llm/utils/log.py @@ -14,15 +14,11 @@ # limitations under the License. import logging -from logging.handlers import TimedRotatingFileHandler import os -# TODO: unify the log format in the project (include gradle(fastapi) frame) - -# Set log format -LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s" -DATE_FORMAT = "%Y-%m-%d %H:%M:%S %p" +from pyhugegraph.utils import log +# TODO: unify the log format in the project (include gradle(fastapi) frame) # Configure log file path and maximum size LOG_DIR = "logs" if not os.path.exists(LOG_DIR): @@ -30,43 +26,5 @@ LOG_FILE = os.path.join(LOG_DIR, "llm-server.log") # Create a logger -log = logging.getLogger("llm_app") -log.setLevel(logging.DEBUG) - -# Create a handler for writing to log file -file_handler = TimedRotatingFileHandler(LOG_FILE, when='midnight', interval=1, - backupCount=7, encoding='utf-8') -file_handler.setLevel(logging.DEBUG) -file_handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)) -# Add the handler, and we could use 'log.Info(xxx)' in other files -log.addHandler(file_handler) - - -# ANSI escape sequences for colors -class CustomConsoleHandler(logging.StreamHandler): - COLORS = { - "DEBUG": "\033[0;37m", # White - "INFO": "\033[0;32m", # Green - "WARNING": "\033[0;33m", # Yellow - "ERROR": "\033[0;31m", # Red - "CRITICAL": "\033[0;41m" # Red background - } - - def emit(self, record): - try: - msg = self.format(record) - level = record.levelname - color_prefix = self.COLORS.get(level, "\033[0;37m") # Default to white - color_suffix = "\033[0m" # Reset to default - stream = self.stream - stream.write(color_prefix + msg + color_suffix + self.terminator) - self.flush() - except Exception: # pylint: disable=broad-exception-caught - self.handleError(record) - - -# Also output logs to the console, we could add a StreamHandler here (Optional) -custom_handler = CustomConsoleHandler() # console_handler = logging.StreamHandler() -custom_handler.setLevel(logging.DEBUG) -custom_handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)) -log.addHandler(custom_handler) +log = log.init_logger(log_output=LOG_FILE, log_level=logging.DEBUG, logger_name="rag", + max_log_size=20 * 1024 * 1024) diff --git a/hugegraph-python-client/requirements.txt b/hugegraph-python-client/requirements.txt index d7a8148a..be3a3e20 100644 --- a/hugegraph-python-client/requirements.txt +++ b/hugegraph-python-client/requirements.txt @@ -1,4 +1,5 @@ -decorator==5.1.1 -requests==2.32.0 -setuptools==70.0.0 -urllib3==2.2.2 +decorator~=5.1.1 +requests~=2.32.0 +setuptools~=70.0.0 +urllib3~=2.2.2 +rich~=13.9.4 diff --git a/hugegraph-python-client/src/pyhugegraph/utils/log.py b/hugegraph-python-client/src/pyhugegraph/utils/log.py old mode 100755 new mode 100644 index 157d26c1..7ccd794b --- a/hugegraph-python-client/src/pyhugegraph/utils/log.py +++ b/hugegraph-python-client/src/pyhugegraph/utils/log.py @@ -13,67 +13,191 @@ # See the License for the specific language governing permissions and # limitations under the License. +import atexit import logging import os -from logging.handlers import TimedRotatingFileHandler - -# Set log format -LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s" -DATE_FORMAT = "%Y-%m-%d %H:%M:%S %p" - - -# Function to configure the logging path, default is "logs/output.log" -# You could import it in "__init__.py" & use it in the whole package -def init_log(log_file="logs/output.log"): - # Ensure the log directory exists - log_dir = os.path.dirname(log_file) - os.makedirs(log_dir, exist_ok=True) - - # Create a logger - log = logging.getLogger(__name__) # pylint: disable=redefined-outer-name - log.setLevel(logging.INFO) - - # Create a handler for writing to log files - file_handler = TimedRotatingFileHandler( - log_file, when="midnight", interval=1, backupCount=3, encoding="utf-8" - ) - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)) - log.addHandler(file_handler) - - # ANSI escape sequences for colors - class CustomConsoleHandler(logging.StreamHandler): - COLORS = { - "DEBUG": "\033[0;37m", # White - "INFO": "\033[0;32m", # Green - "WARNING": "\033[0;33m", # Yellow - "ERROR": "\033[0;31m", # Red - "CRITICAL": "\033[0;41m", # Red background - } - - def emit(self, record): - try: - msg = self.format(record) - level = record.levelname - color_prefix = self.COLORS.get(level, "\033[0;37m") # Default to white - color_suffix = "\033[0m" # Reset to default - stream = self.stream - stream.write(color_prefix + msg + color_suffix + self.terminator) - self.flush() - except Exception as e: # pylint: disable=broad-exception-caught - self.handleError(record) - log.error( # pylint: disable=logging-fstring-interpolation - f"Log Print Exception: {e}" - ) - - # Also output logs to the console - custom_handler = CustomConsoleHandler() - custom_handler.setLevel(logging.DEBUG) - custom_handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)) - log.addHandler(custom_handler) - - return log - - -# Default logger configuration -log = init_log() +import sys +import time +from collections import Counter +from functools import lru_cache +from logging.handlers import RotatingFileHandler + +from rich.logging import RichHandler + +__all__ = [ + "init_logger", + "fetch_log_level", + "log_first_n_times", + "log_every_n_times", + "log_every_n_secs", +] + +LOG_BUFFER_SIZE_ENV: str = "LOG_BUFFER_SIZE" +DEFAULT_BUFFER_SIZE: int = 1024 * 1024 # 1MB + + +@lru_cache() # avoid creating multiple handlers when calling init_logger() +def init_logger( + log_output=None, + log_level=logging.INFO, + rank=0, + *, + logger_name="client", # users should set logger name for modules + propagate_logs: bool = False, + stdout_logging: bool = True, + max_log_size=50 * 1024 * 1024, # 50 MB + backup_logs=5, +): + """ + Initialize the logger and set its verbosity level to "DEBUG". + + Args: + log_output (str): a file name or a directory to save log. If None, will not save a log file. + If it ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `log_output/log.txt`. + logger_name (str): the root module name of this logger + propagate_logs (bool): whether to propagate logs to the parent logger. + stdout_logging (bool): whether to configure logging to stdout. + + Returns: + logging.Logger: a logger + """ + log_instance = logging.getLogger(logger_name) + log_instance.setLevel(log_level) + log_instance.propagate = propagate_logs + + if log_instance.hasHandlers(): + log_instance.handlers.clear() + + # stdout logging: master only + if stdout_logging and rank == 0: + rich_handler = RichHandler(log_level) + rich_handler.setFormatter(logging.Formatter("%(name)s: %(message)s")) + log_instance.addHandler(rich_handler) + + # file logging: all workers + if log_output is not None: + if log_output.endswith(".txt") or log_output.endswith(".log"): + log_filename = log_output + else: + log_filename = os.path.join(log_output, "log.txt") + + if rank > 0: + log_filename = f"{log_filename}.rank{rank}" + + os.makedirs(os.path.dirname(log_filename), exist_ok=True) + file_handler = RotatingFileHandler( + log_filename, + maxBytes=max_log_size, + backupCount=backup_logs, + ) + file_handler.setLevel(logging.DEBUG) + formatter = logging.Formatter( + "[%(asctime)s] %(levelname)s [%(name)s:%(filename)s:%(lineno)d] %(message)s", + datefmt="%m/%d/%y %H:%M:%S", + ) + file_handler.setFormatter(formatter) + log_instance.addHandler(file_handler) + return log_instance + + +# Cache the opened file object, so that different calls to `initialize_logger` +# with the same file name can safely write to the same file. +@lru_cache(maxsize=None) +def _cached_log_file(filename): + """Cache the opened file object""" + # Use 1K buffer if writing to cloud storage + with open(filename, "a", buffering=_determine_buffer_size(filename), encoding="utf-8") as file_io: + atexit.register(file_io.close) + return file_io + + +def _determine_buffer_size(filename: str) -> int: + """Determine the buffer size for the log stream""" + if "://" not in filename: + # Local file, no extra caching is necessary + return -1 + # Remote file requires a larger cache to avoid many smalls writes. + if LOG_BUFFER_SIZE_ENV in os.environ: + return int(os.environ[LOG_BUFFER_SIZE_ENV]) + return DEFAULT_BUFFER_SIZE + + +def _identify_caller(): + """ + Returns: + str: module name of the caller + tuple: a hashable key to be used to identify different callers + """ + frame = sys._getframe(2) # pylint: disable=protected-access + while frame: + code = frame.f_code + if os.path.join("utils", "logger.") not in code.co_filename: + module_name = frame.f_globals["__name__"] + if module_name == "__main__": + module_name = "core" + return module_name, (code.co_filename, frame.f_lineno, code.co_name) + frame = frame.f_back + return None, None + + +LOG_COUNTER = Counter() +LOG_TIMERS = {} + + +def log_first_n_times(level, message, n=1, *, logger_name=None, key="caller"): + """ + Log only for the first n times. + + Args: + logger_name (str): name of the logger to use. Will use the caller's module by default. + key (str or tuple[str]): the string(s) can be one of "callers" or + "message", which defines how to identify duplicated logs. + For example, if called with `n=1, key="caller"`, this function + will only log the first call from the same caller, regardless of + the message content. + If called with `n=1, key="message"`, this function will log the + same content only once, even if they are called from different places. + If called with `n=1, key=("caller", "message")`, this function + will not log only if the same caller has logged the same message before. + """ + if isinstance(key, str): + key = (key,) + assert len(key) > 0 + + caller_module, caller_key = _identify_caller() + hash_key = () + if "caller" in key: + hash_key = hash_key + caller_key + if "message" in key: + hash_key = hash_key + (message,) + + LOG_COUNTER[hash_key] += 1 + if LOG_COUNTER[hash_key] <= n: + logging.getLogger(logger_name or caller_module).log(level, message) + + +def log_every_n_times(level, message, n=1, *, logger_name=None): + caller_module, key = _identify_caller() + LOG_COUNTER[key] += 1 + if n == 1 or LOG_COUNTER[key] % n == 1: + logging.getLogger(logger_name or caller_module).log(level, message) + + +def log_every_n_secs(level, message, n=1, *, logger_name=None): + caller_module, key = _identify_caller() + last_logged = LOG_TIMERS.get(key, None) + current_time = time.time() + if last_logged is None or current_time - last_logged >= n: + logging.getLogger(logger_name or caller_module).log(level, message) + LOG_TIMERS[key] = current_time + + +def fetch_log_level(level_name: str): + """Fetch the logging level by its name""" + level = getattr(logging, level_name.upper(), None) + if not isinstance(level, int): + raise ValueError(f"Invalid log level: {level_name}") + return level + +log = init_logger(log_output="logs/output.log", log_level=logging.INFO)