From 6a1275580754818bb8f7aca5bfb5020beee67179 Mon Sep 17 00:00:00 2001 From: vinicvaz Date: Thu, 7 Dec 2023 09:49:24 -0300 Subject: [PATCH] fix models --- .domino/compiled_metadata.json | 28 +++++++-- pieces/TextSummarizerLocalPiece/models.py | 16 +++-- pieces/TextSummarizerLocalPiece/piece.py | 73 +++++++++++++---------- pieces/TextSummarizerPiece/models.py | 10 +++- 4 files changed, 83 insertions(+), 44 deletions(-) diff --git a/.domino/compiled_metadata.json b/.domino/compiled_metadata.json index 0ef3a6e..3809681 100644 --- a/.domino/compiled_metadata.json +++ b/.domino/compiled_metadata.json @@ -466,16 +466,32 @@ "description": "Input data for TextSummarizerPiece", "properties": { "input_file_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": "", "description": "The path to the text file to summarize.", - "title": "Input File Path", - "type": "string" + "from_upstream": "always", + "title": "Input File Path" }, "input_text": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], "default": "", "description": "The text to summarize.", "title": "Input Text", - "type": "string" + "widget": "textarea" }, "output_type": { "allOf": [ @@ -1046,8 +1062,8 @@ ], "default": null, "description": "Text to summarize", - "required": false, - "title": "Text" + "title": "Text", + "widget": "textarea" }, "text_file_path": { "anyOf": [ @@ -1060,7 +1076,7 @@ ], "default": null, "description": "Use it only if not using text field. File path to the text to summarize", - "required": false, + "from_upstream": "always", "title": "Text File Path" }, "output_type": { diff --git a/pieces/TextSummarizerLocalPiece/models.py b/pieces/TextSummarizerLocalPiece/models.py index 7e8fbde..34abda9 100644 --- a/pieces/TextSummarizerLocalPiece/models.py +++ b/pieces/TextSummarizerLocalPiece/models.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, Field, FilePath, validators -from typing import Union +from typing import Union, Optional from enum import Enum @@ -12,13 +12,19 @@ class InputModel(BaseModel): """ Input data for TextSummarizerPiece """ - input_file_path: str = Field( + input_file_path: Optional[str] = Field( description='The path to the text file to summarize.', - default="" + default="", + json_schema_extra={ + "from_upstream": "always" + } ) - input_text: str = Field( + input_text: Optional[str] = Field( description='The text to summarize.', - default="" + default="", + json_schema_extra={ + 'widget': "textarea", + } ) output_type: OutputTypeType = Field( description='The type of output fot the result text.', diff --git a/pieces/TextSummarizerLocalPiece/piece.py b/pieces/TextSummarizerLocalPiece/piece.py index 2841ac7..d7bcf16 100644 --- a/pieces/TextSummarizerLocalPiece/piece.py +++ b/pieces/TextSummarizerLocalPiece/piece.py @@ -6,37 +6,49 @@ -def summarize_long_text(text: str, summarizer, iteration: int=0): - """ - Generate the summary by concatenating the summaries of the individual chunks. 
- """ - iteration += 1 - print(f"Iteration: {iteration}") - - # Preprocess text - text = text.lower().replace(".", " ").replace(",", " ").replace("\n", " ") - text = "".join(ch if ch.isalnum() or ch == " " else " " for ch in text) - - # Split the input text into chunks - chunk_size = 1000 - chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)] - print(f"chunks to process: {len(chunks)}") - - # Generate the summary for each chunk - summary_list = [ - summarizer(chunk, max_length=60, min_length=30, no_repeat_ngram_size=3)[0]['summary_text'] - for chunk in chunks - ] - summary = " ".join(summary_list) - - if len(summary) > 2000: - return summarize_long_text(summary, summarizer, iteration) - else: - return summary - - class TextSummarizerLocalPiece(BasePiece): + def summarize_long_text(self, text: str, summarizer, iteration: int=0): + """ + Generate the summary by concatenating the summaries of the individual chunks. + """ + iteration += 1 + print(f"Iteration: {iteration}") + + # Preprocess text + text = text.lower().replace(".", " ").replace(",", " ").replace("\n", " ") + text = "".join(ch if ch.isalnum() or ch == " " else " " for ch in text) + + # Split the input text into chunks + chunk_size = 1000 + chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)] + print(f"chunks to process: {len(chunks)}") + + # Generate the summary for each chunk + summary_list = [ + summarizer(chunk, max_length=60, min_length=30, no_repeat_ngram_size=3)[0]['summary_text'] + for chunk in chunks + ] + summary = " ".join(summary_list) + + if len(summary) > 2000: + return self.summarize_long_text(summary, summarizer, iteration) + else: + return summary + + def format_display_result(self, final_summary: str): + md_text = f""" +## Summarized text +{final_summary} +""" + file_path = f"{self.results_path}/display_result.md" + with open(file_path, "w") as f: + f.write(md_text) + self.display_result = { + "file_type": "md", + "file_path": file_path + } + def piece_function(self, input_data: InputModel): # Set device @@ -65,7 +77,7 @@ def piece_function(self, input_data: InputModel): # Run summarizer self.logger.info("Running summarizer...") - result = summarize_long_text(text=text_str, summarizer=summarizer) + result = self.summarize_long_text(text=text_str, summarizer=summarizer) # Return result if input_data.output_type == "xcom": @@ -81,6 +93,7 @@ def piece_function(self, input_data: InputModel): with open(output_file_path, "w") as f: f.write(result) + self.format_display_result(final_summary=result) return OutputModel( message=msg, summary_result=summary_result, diff --git a/pieces/TextSummarizerPiece/models.py b/pieces/TextSummarizerPiece/models.py index 61c4d4e..d39f750 100644 --- a/pieces/TextSummarizerPiece/models.py +++ b/pieces/TextSummarizerPiece/models.py @@ -23,16 +23,20 @@ class LLMModelType(str, Enum): class InputModel(BaseModel): """ TextSummarizerPiece Input model - """ + """ text: Optional[str] = Field( default=None, description="Text to summarize", - required=False # Setting to false because can use text or text_file_path + json_schema_extra={ + 'widget': "textarea", + } ) text_file_path: Optional[str] = Field( default=None, description="Use it only if not using text field. File path to the text to summarize", - required=False # Setting to false because can use text or text_file_path + json_schema_extra={ + "from_upstream": "always" + } ) output_type: OutputTypeType = Field( default=OutputTypeType.string,
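
Note on the new model pattern (not part of the patch itself): the change drops the old required=False keyword (and its "can use text or text_file_path" comment) in favor of Optional[...] annotations plus Pydantic v2 json_schema_extra, which is what produces the anyOf / widget / from_upstream entries visible in the compiled_metadata.json hunks above. Below is a minimal standalone sketch of that behavior, assuming Pydantic v2; the ExampleInput class is illustrative only and mirrors, rather than reproduces, the patched InputModel fields.

    from typing import Optional
    from pydantic import BaseModel, Field

    class ExampleInput(BaseModel):
        # Mirrors the shape of the patched TextSummarizerLocalPiece fields
        input_text: Optional[str] = Field(
            default="",
            description="The text to summarize.",
            json_schema_extra={"widget": "textarea"},
        )
        input_file_path: Optional[str] = Field(
            default="",
            description="The path to the text file to summarize.",
            json_schema_extra={"from_upstream": "always"},
        )

    schema = ExampleInput.model_json_schema()
    # Optional[str] renders as anyOf [string, null]; the json_schema_extra
    # keys are merged directly into each property definition.
    print(schema["properties"]["input_text"]["anyOf"])
    print(schema["properties"]["input_text"]["widget"])              # "textarea"
    print(schema["properties"]["input_file_path"]["from_upstream"])  # "always"

The intent of the removed comments (either field may be supplied, typically one from upstream and one typed directly) is still expressed by the Optional annotations with empty or None defaults.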