diff --git a/file-summarizer/file_reader.py b/file-summarizer/file_reader.py index 3cdd8a94..8f3e2d2a 100644 --- a/file-summarizer/file_reader.py +++ b/file-summarizer/file_reader.py @@ -3,6 +3,7 @@ from summarize import DocumentSummarizer, MODEL, TIKTOKEN_MODEL, MAX_CHUNK_TOKENS, MAX_WORKERS import os import tiktoken +import asyncio logger = setup_logger(__name__) @@ -28,12 +29,15 @@ async def main(): try: final_summary = summarizer.summarize(file_content) except Exception as e: + logger.error(f"Summarization failed: {e}") raise Exception(f"ERROR: Summarization failed: {e}") - response_str = f"Uploaded file {input_file} contains {len(tokens)} tokens.\n\n" - response_str += f"Summary of the file content:\n\n{final_summary}" + response_str = f"The uploaded file {input_file} contains too many tokens ({len(tokens)}), here is the summary of the file content:\n\n{final_summary}" print(response_str) return response_str else: # if the file has less than TOKEN_THRESHOLD tokens, directly return the file content print(file_content) - return file_content \ No newline at end of file + return file_content + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/file-summarizer/load_text.py b/file-summarizer/load_text.py index 9415fdce..09d0cb55 100644 --- a/file-summarizer/load_text.py +++ b/file-summarizer/load_text.py @@ -3,8 +3,9 @@ import fitz # PyMuPDF import docx from pptx import Presentation -from helper import load_from_gptscript_workspace, save_to_gptscript_workspace +from helper import load_from_gptscript_workspace, setup_logger +logger = setup_logger(__name__) def extract_text_from_pdf(pdf_bytes: bytes) -> str: """Extracts text from a PDF file given as bytes.""" @@ -53,6 +54,7 @@ async def load_text_from_file(file_path: str) -> str: try: file_content = await load_from_gptscript_workspace(file_path) except Exception as e: + logger.error(f"Failed to load file from GPTScript workspace file {file_path}, Error: {e}") raise ValueError( f"Failed to load file from GPTScript workspace file {file_path}, Error: {e}" ) diff --git a/file-summarizer/summarize.py b/file-summarizer/summarize.py index 1f96e944..bdf3d2e9 100644 --- a/file-summarizer/summarize.py +++ b/file-summarizer/summarize.py @@ -295,7 +295,7 @@ async def main(): if output_file == "": directory, file_name = os.path.split(input_file) name, ext = os.path.splitext(file_name) - summary_file_name = f"{name}_summary{ext}" + summary_file_name = f"{name}_summary.md" output_file = os.path.join(directory, summary_file_name) try: diff --git a/file-summarizer/tool.gpt b/file-summarizer/tool.gpt index 726f26eb..9a2b9f4d 100644 --- a/file-summarizer/tool.gpt +++ b/file-summarizer/tool.gpt @@ -1,20 +1,11 @@ ---- Name: File Summarizer Description: This tool summarizes the input file in the workspace, returns a text summary of the file content, either write to a file in the workspace or print to the console. Credential: sys.model.provider.credential -Params: input_file: (Required) Name of the file in the workspace to summarize. Supported formats: [.md", ".txt", ".markdown", ".text", ".mdx", ".mdtxt", ".mdtxtx", ".docx", ".pdf", ".pptx"]. For any other file types, simply say it's not supported yet. +Params: input_file: (Required) Name of the file in the workspace to summarize. Supported formats: [.md, .txt, .markdown, .text, .mdx, .mdtxt, .mdtxtx, .docx, .pdf, .pptx]. For any other file types, simply say it's not supported yet. Params: output_file: (Optional) Name of the file to save the summary, default to empty string. If not provided, a summary file will be created in the same directory as the input file. To print to the console, set this to "NONE". #!/usr/bin/env python3 ${GPTSCRIPT_TOOL_DIR}/summarize.py ---- -Name: File Reader -Description: This tool reads the input file in the workspace, returns the file content and print to the console. -Credential: sys.model.provider.credential -Params: input_file: (Required) Name of the file in the workspace to summarize. Supported formats: [.md", ".txt", ".markdown", ".text", ".mdx", ".mdtxt", ".mdtxtx", ".docx", ".pdf", ".pptx"]. For any other file types, simply say it's not supported yet. - -#!/usr/bin/env python3 ${GPTSCRIPT_TOOL_DIR}/file_reader.py - --- !metadata:*:category Utilities