From fd3c0fd26eff7d3be72fa4f2a234e33b9f796627 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 23 Sep 2024 19:32:01 +0800 Subject: [PATCH] feat: Initial version if Graph RAG in KAGGLE scenario (#301) * Initial version if Graph RAG in KAGGLE scenario * fix CI * fix a small bug * fix CI * fix CI * fix CI --- Makefile | 2 +- rdagent/app/kaggle/conf.py | 7 ++ rdagent/app/kaggle/loop.py | 10 +- .../CoSTEER/knowledge_management.py | 14 ++- .../CoSTEER/knowledge_management.py | 10 +- .../components/knowledge_management/graph.py | 33 +----- .../knowledge_management/vector_base.py | 32 +----- rdagent/components/workflow/conf.py | 2 + rdagent/core/evolving_framework.py | 6 +- rdagent/core/knowledge_base.py | 25 ++++ rdagent/core/prompts.py | 2 +- rdagent/core/proposal.py | 7 +- rdagent/oai/llm_utils.py | 2 +- .../meta_tpl/fea_share_preprocess.py | 14 +-- .../scenarios/kaggle/experiment/prompts.yaml | 2 +- .../scenarios/kaggle/experiment/scenario.py | 14 ++- .../scenarios/kaggle/experiment/workspace.py | 21 ++-- .../kaggle/knowledge_management/graph.py | 108 ++++++++++++++++++ .../kaggle/knowledge_management/prompts.yaml | 43 ++++++- .../knowledge_management/vector_base.py | 25 ++-- rdagent/scenarios/kaggle/prompts.yaml | 27 +++++ rdagent/scenarios/kaggle/proposal/proposal.py | 94 ++++++++++++++- 22 files changed, 382 insertions(+), 118 deletions(-) create mode 100644 rdagent/core/knowledge_base.py create mode 100644 rdagent/scenarios/kaggle/knowledge_management/graph.py diff --git a/Makefile b/Makefile index 950ec5e75..11ad028d5 100644 --- a/Makefile +++ b/Makefile @@ -97,7 +97,7 @@ mypy: # First deal with the core folder, and then gradually increase the scope of detection, # and eventually realize the detection of the complete project. ruff: - $(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002 # --exclude rdagent/scripts,git_ignore_folder + $(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002,I001 # --exclude rdagent/scripts,git_ignore_folder # Check lint with toml-sort. toml-sort: diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py index b1f30fbd5..e9b668a2a 100644 --- a/rdagent/app/kaggle/conf.py +++ b/rdagent/app/kaggle/conf.py @@ -16,6 +16,13 @@ class Config: scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario" """Scenario class for data mining model""" + knowledge_base: str = "" # TODO enable this line to use the knowledge base + # knowledge_base: str = "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph" + """Knowledge base class""" + + knowledge_base_path: str = "kg_graph.pkl" + """Knowledge base path""" + hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen" """Hypothesis generation class""" diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index b250c343d..11c757720 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -22,6 +22,7 @@ from rdagent.scenarios.kaggle.proposal.proposal import ( KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING, + KGTrace, ) @@ -32,6 +33,13 @@ def __init__(self, PROP_SETTING: BasePropSetting): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) logger.log_object(scen, tag="scenario") + knowledge_base = ( + import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen) + if PROP_SETTING.knowledge_base != "" + else None + ) + logger.log_object(knowledge_base, tag="knowledge_base") + self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) logger.log_object(self.hypothesis_gen, tag="hypothesis generator") @@ -50,7 +58,7 @@ def __init__(self, PROP_SETTING: BasePropSetting): self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) logger.log_object(self.summarizer, tag="summarizer") - self.trace = Trace(scen=scen) + self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) super(RDLoop, self).__init__() @measure_time diff --git a/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py index 462c58122..65cc84bac 100644 --- a/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py @@ -22,9 +22,9 @@ ) from rdagent.core.evolving_framework import ( EvolvableSubjects, + EvolvingKnowledgeBase, EvoStep, Knowledge, - KnowledgeBase, QueriedKnowledge, RAGStrategy, ) @@ -71,12 +71,13 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s self.failed_task_info_set = failed_task_info_set -class FactorKnowledgeBaseV1(KnowledgeBase): - def __init__(self) -> None: +class FactorKnowledgeBaseV1(EvolvingKnowledgeBase): + def __init__(self, path: str | Path = None) -> None: self.implementation_trace: dict[str, FactorKnowledge] = dict() self.success_task_info_set: set[str] = set() self.task_to_embedding = dict() + super().__init__(path) def query(self) -> QueriedKnowledge | None: """ @@ -746,12 +747,12 @@ def dataset_query( return factor_implementation_queried_graph_knowledge -class FactorGraphKnowledgeBase(KnowledgeBase): - def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> None: +class FactorGraphKnowledgeBase(EvolvingKnowledgeBase): + def __init__(self, init_component_list=None, path: str | Path = None, data_set_knowledge_path=None) -> None: """ Load knowledge, offer brief information of knowledge and common handle interfaces """ - self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl") + self.graph: UndirectedGraph = UndirectedGraph(Path.cwd() / "graph.pkl") logger.info(f"Knowledge Graph loaded, size={self.graph.size()}") if init_component_list: @@ -780,6 +781,7 @@ def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> No if data_set_knowledge_path: with open(data_set_knowledge_path, "r") as f: self.data_set_knowledge_dict = json.load(f) + super().__init__(path) def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]: return self.graph.get_all_nodes_by_label(label) diff --git a/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py index bd88eb4c4..c9a266fbd 100644 --- a/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py +++ b/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py @@ -1,11 +1,13 @@ +from pathlib import Path + from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS from rdagent.components.coder.model_coder.CoSTEER.evaluators import ModelCoderFeedback from rdagent.components.coder.model_coder.model import ModelTask from rdagent.core.evolving_framework import ( EvolvableSubjects, + EvolvingKnowledgeBase, EvoStep, Knowledge, - KnowledgeBase, QueriedKnowledge, RAGStrategy, ) @@ -49,13 +51,15 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s self.working_task_to_similar_successful_knowledge_dict = dict() -class ModelKnowledgeBase(KnowledgeBase): - def __init__(self) -> None: +class ModelKnowledgeBase(EvolvingKnowledgeBase): + def __init__(self, path: str | Path = None) -> None: self.implementation_trace: dict[str, ModelKnowledge] = dict() self.success_task_info_set: set[str] = set() self.task_to_embedding = dict() + super().__init__(path) + def query(self) -> QueriedKnowledge | None: """ Query the knowledge base to get the queried knowledge. So far is handled in RAG strategy. diff --git a/rdagent/components/knowledge_management/graph.py b/rdagent/components/knowledge_management/graph.py index c25f84e9a..50b408acc 100644 --- a/rdagent/components/knowledge_management/graph.py +++ b/rdagent/components/knowledge_management/graph.py @@ -12,6 +12,7 @@ VectorBase, cosine, ) +from rdagent.core.knowledge_base import KnowledgeBase from rdagent.oai.llm_utils import APIBackend Node = KnowledgeMetaData @@ -47,14 +48,14 @@ def __repr__(self) -> str: ) -class Graph: +class Graph(KnowledgeBase): """ base Graph class for Knowledge Graph Search """ def __init__(self, path: str | Path | None = None) -> None: - self.path = path self.nodes = {} + super().__init__(path=path) def size(self) -> int: return len(self.nodes) @@ -77,22 +78,6 @@ def find_node(self, content: str, label: str) -> Node | None: return node return None - @classmethod - def load(cls: type[Graph], path: str | Path) -> Graph: - """use pickle as the default load method""" - path = path if isinstance(path, Path) else Path(path) - if not path.exists(): - return cls(path=path) - - with path.open("rb") as f: - return pickle.load(f) - - def save(self, path: str | Path) -> None: - """use pickle as the default save method""" - Path.mkdir(path.parent, exist_ok=True) - with path.open("wb") as f: - pickle.dump(self, f) - @staticmethod def batch_embedding(nodes: list[Node]) -> list[Node]: contents = [node.content for node in nodes] @@ -119,8 +104,8 @@ class UndirectedGraph(Graph): """ def __init__(self, path: str | Path | None = None) -> None: - super().__init__(path=path) self.vector_base: VectorBase = PDVectorBase() + super().__init__(path=path) def __str__(self) -> str: return f"UndirectedGraph(nodes={self.nodes})" @@ -174,16 +159,6 @@ def add_node( node.add_neighbor(neighbor) - @classmethod - def load(cls: type[UndirectedGraph], path: str | Path) -> UndirectedGraph: - """use pickle as the default load method""" - path = path if isinstance(path, Path) else Path(path) - if not path.exists(): - return cls(path=path) - - with path.open("rb") as f: - return pickle.load(f) - def add_nodes(self, node: UndirectedNode, neighbors: list[UndirectedNode]) -> None: if not neighbors: self.add_node(node) diff --git a/rdagent/components/knowledge_management/vector_base.py b/rdagent/components/knowledge_management/vector_base.py index f5d22c5e3..bb98caa2c 100644 --- a/rdagent/components/knowledge_management/vector_base.py +++ b/rdagent/components/knowledge_management/vector_base.py @@ -5,6 +5,7 @@ import pandas as pd from scipy.spatial.distance import cosine +from rdagent.core.knowledge_base import KnowledgeBase from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend @@ -68,14 +69,11 @@ def contents_to_documents(contents: List[str], label: str = None) -> List[Docume return docs -class VectorBase: +class VectorBase(KnowledgeBase): """ This class is used for handling vector storage and query """ - def __init__(self, vector_df_path: Union[str, Path] = None, **kwargs): - pass - def add(self, document: Union[Document, List[Document]]): """ add new node to vector_df @@ -104,28 +102,15 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0) """ pass - def load(self, **kwargs): - """load vector_df""" - - def save(self, **kwargs): - """save vector_df""" - class PDVectorBase(VectorBase): """ Implement of VectorBase using Pandas """ - def __init__(self, vector_df_path: Union[str, Path] = None): - super().__init__(vector_df_path) - - if vector_df_path: - try: - self.vector_df = self.load(vector_df_path) - except FileNotFoundError: - self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"]) - else: - self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"]) + def __init__(self, path: Union[str, Path] = None): + self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"]) + super().__init__(path) def shape(self): return self.vector_df.shape @@ -196,10 +181,3 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0) for _, similar_docs in most_similar_docs.iterrows(): docs.append(Document().from_dict(similar_docs.to_dict())) return docs, searched_similarities.to_list() - - def load(self, vector_df_path, **kwargs): - vector_df = pd.read_pickle(vector_df_path) - return vector_df - - def save(self, vector_df_path, **kwargs): - self.vector_df.to_pickle(vector_df_path) diff --git a/rdagent/components/workflow/conf.py b/rdagent/components/workflow/conf.py index f4d1f2a3a..77ed0698c 100644 --- a/rdagent/components/workflow/conf.py +++ b/rdagent/components/workflow/conf.py @@ -14,6 +14,8 @@ class Config: """ scen: str = "" + knowledge_base: str = "" + knowledge_base_path: str = "" hypothesis_gen: str = "" hypothesis2experiment: str = "" coder: str = "" diff --git a/rdagent/core/evolving_framework.py b/rdagent/core/evolving_framework.py index 25f37b4b0..016abad01 100644 --- a/rdagent/core/evolving_framework.py +++ b/rdagent/core/evolving_framework.py @@ -5,6 +5,8 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any +from rdagent.core.knowledge_base import KnowledgeBase + if TYPE_CHECKING: from rdagent.core.evaluation import Feedback from rdagent.core.scenario import Scenario @@ -18,7 +20,7 @@ class QueriedKnowledge: pass -class KnowledgeBase(ABC): +class EvolvingKnowledgeBase(KnowledgeBase): @abstractmethod def query( self, @@ -78,7 +80,7 @@ def evolve( class RAGStrategy(ABC): """Retrieval Augmentation Generation Strategy""" - def __init__(self, knowledgebase: KnowledgeBase) -> None: + def __init__(self, knowledgebase: EvolvingKnowledgeBase) -> None: self.knowledgebase = knowledgebase @abstractmethod diff --git a/rdagent/core/knowledge_base.py b/rdagent/core/knowledge_base.py new file mode 100644 index 000000000..f4cd9ab51 --- /dev/null +++ b/rdagent/core/knowledge_base.py @@ -0,0 +1,25 @@ +from pathlib import Path + +import dill as pickle # type: ignore[import-untyped] + +from rdagent.log import rdagent_logger as logger + + +class KnowledgeBase: + def __init__(self, path: str | Path | None = None) -> None: + self.path = Path(path) if path else None + self.load() + + def load(self) -> None: + if self.path is not None and self.path.exists(): + with self.path.open("rb") as f: + self.__dict__.update( + pickle.load(f).__dict__, + ) # TODO: because we need to align with init function, we need a less hacky way to do this + + def dump(self) -> None: + if self.path is not None: + self.path.parent.mkdir(parents=True, exist_ok=True) + pickle.dump(self, self.path.open("wb")) + else: + logger.warning("KnowledgeBase path is not set, dump failed.") diff --git a/rdagent/core/prompts.py b/rdagent/core/prompts.py index 02898480a..5fc059597 100644 --- a/rdagent/core/prompts.py +++ b/rdagent/core/prompts.py @@ -1,4 +1,4 @@ -from pathlib import Path # noqa: I001 +from pathlib import Path import yaml diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py index 58c044d74..a420bb311 100644 --- a/rdagent/core/proposal.py +++ b/rdagent/core/proposal.py @@ -9,6 +9,7 @@ from rdagent.core.evaluation import Feedback from rdagent.core.experiment import ASpecificExp, Experiment +from rdagent.core.knowledge_base import KnowledgeBase from rdagent.core.scenario import Scenario if TYPE_CHECKING: @@ -83,12 +84,14 @@ def __str__(self) -> str: ASpecificScen = TypeVar("ASpecificScen", bound=Scenario) +ASpecificKB = TypeVar("ASpecificKB", bound=KnowledgeBase) -class Trace(Generic[ASpecificScen]): - def __init__(self, scen: ASpecificScen) -> None: +class Trace(Generic[ASpecificScen, ASpecificKB]): + def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None: self.scen: ASpecificScen = scen self.hist: list[tuple[Hypothesis, Experiment, HypothesisFeedback]] = [] + self.knowledge_base: ASpecificKB | None = knowledge_base def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]: """Access the last experiment result, sub-task, and the corresponding hypothesis.""" diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py index 8cf410b61..d5ccd43e9 100644 --- a/rdagent/oai/llm_utils.py +++ b/rdagent/oai/llm_utils.py @@ -90,7 +90,7 @@ def __init__(self, cache_location: str) -> None: self.cache_location = cache_location db_file_exist = Path(cache_location).exists() # TODO: sqlite3 does not support multiprocessing. - self.conn = sqlite3.connect(cache_location) + self.conn = sqlite3.connect(cache_location, timeout=20) self.c = self.conn.cursor() if not db_file_exist: self.c.execute( diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py index 4b4ef2736..a6d31a9c4 100644 --- a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py +++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py @@ -84,13 +84,13 @@ def preprocess_script(): """ This method applies the preprocessing steps to the training, validation, and test datasets. """ - if os.path.exists("X_train.pkl"): - X_train = pd.read_pickle("X_train.pkl") - X_valid = pd.read_pickle("X_valid.pkl") - y_train = pd.read_pickle("y_train.pkl") - y_valid = pd.read_pickle("y_valid.pkl") - X_test = pd.read_pickle("X_test.pkl") - passenger_ids = pd.read_pickle("passenger_ids.pkl") + if os.path.exists("/kaggle/preprocessed_data/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/preprocessed_data/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/preprocessed_data/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/preprocessed_data/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/preprocessed_data/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/preprocessed_data/X_test.pkl") + passenger_ids = pd.read_pickle("/kaggle/preprocessed_data/passenger_ids.pkl") return X_train, X_valid, y_train, y_valid, X_test, passenger_ids X_train, X_valid, y_train, y_valid = prepreprocess() diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml index ab3ab1392..dc35ce318 100644 --- a/rdagent/scenarios/kaggle/experiment/prompts.yaml +++ b/rdagent/scenarios/kaggle/experiment/prompts.yaml @@ -56,7 +56,7 @@ kg_background: |- The user tries to optimize the performance iteratively by employing one of the feature related or model related action items: - Feature related: - "Feature engineering": The user will design several new tasks and implement several new features. The new feature might only affect the model using all the feature book. - - "Feature processing": The user will design a new task to process the feature book like normalization or one hot encoding to improve the model performance. + - "Feature processing": The user will design a new task to process the feature book like normalization or one hot encoding to improve the model performance. Any processing with help of a deep model is not included in this task. - Model related: - "Model feature selection": The user will modify one model to select the most important features from the feature book to improve the model performance. - "Model tuning": The user will tune the hyperparameters of XGBoost, RandomForest or LightGBM or build or improve the NN model to improve the model performance. diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index c8221ea64..95fd3158e 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -38,11 +38,6 @@ def __init__(self, competition: str) -> None: self._background = self.background - # all competitions are based on the same vector base - self.vector_base = KaggleExperienceBase() - if KAGGLE_IMPLEMENT_SETTING.rag_path and Path(KAGGLE_IMPLEMENT_SETTING.rag_path).exists(): - self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path) - def _analysis_competition_description(self): sys_prompt = ( Environment(undefined=StrictUndefined) @@ -69,7 +64,14 @@ def _analysis_competition_description(self): self.competition_type = response_json_analysis.get("Competition Type", "No type provided") self.competition_description = response_json_analysis.get("Competition Description", "No description provided") self.target_description = response_json_analysis.get("Target Description", "No target provided") - self.competition_features = self.source_data + self.competition_features = response_json_analysis.get("Competition Features", "No features provided") + + def get_competition_full_desc(self) -> str: + return f"""Competition Type: {self.competition_type} +Competition Description: {self.competition_description} +Target Description: {self.target_description} +Competition Features: {self.competition_features} +""" @property def background(self) -> str: diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index 30992d44c..e6f023ca2 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -5,6 +5,7 @@ import pandas as pd from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS from rdagent.core.experiment import FBWorkspace from rdagent.log import rdagent_logger as logger from rdagent.utils.env import KGDockerEnv @@ -64,22 +65,24 @@ def generate_preprocess_data( def execute(self, run_env: dict = {}, *args, **kwargs) -> str: logger.info(f"Running the experiment in {self.workspace_path}") - # link the data to the workspace to speed up the preprocessing - source_data_path = Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / KAGGLE_IMPLEMENT_SETTING.competition - self.link_all_files_in_folder_to_workspace(source_data_path, self.workspace_path) - kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition) kgde.prepare() + running_extra_volume = { + ( + Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition + ).absolute(): "/kaggle/preprocessed_data" + } + if KAGGLE_IMPLEMENT_SETTING.competition: + running_extra_volume[ + KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition + ] = "/kaggle/input" + execute_log = kgde.run( local_path=str(self.workspace_path), entry=f"python train.py", env=run_env, - running_extra_volume=( - {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"} - if KAGGLE_IMPLEMENT_SETTING.competition - else None - ), + running_extra_volume=running_extra_volume, ) csv_path = self.workspace_path / "submission_score.csv" diff --git a/rdagent/scenarios/kaggle/knowledge_management/graph.py b/rdagent/scenarios/kaggle/knowledge_management/graph.py new file mode 100644 index 000000000..3c6d42200 --- /dev/null +++ b/rdagent/scenarios/kaggle/knowledge_management/graph.py @@ -0,0 +1,108 @@ +import json +from pathlib import Path +from typing import List + +from jinja2 import Environment, StrictUndefined +from tqdm import tqdm + +from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING +from rdagent.components.knowledge_management.graph import ( + UndirectedGraph, + UndirectedNode, +) +from rdagent.core.conf import RD_AGENT_SETTINGS +from rdagent.core.prompts import Prompts +from rdagent.core.utils import multiprocessing_wrapper +from rdagent.oai.llm_utils import APIBackend +from rdagent.scenarios.kaggle.experiment.scenario import KGScenario + +PROMPT_DICT = Prompts(file_path=Path(__file__).parent / "prompts.yaml") + + +class KGKnowledgeGraph(UndirectedGraph): + def __init__(self, path: str | Path | None, scenario: KGScenario) -> None: + super().__init__(path) + if path is not None and not Path(path).exists(): + documents = [] + for file_path in (Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / "domain_knowledge").glob("*.case"): + with open(file_path, "r") as f: + documents.append(f.read()) + self.load_from_documents(documents=documents, scenario=scenario) + self.dump() + tmp = 1 + + def analyze_one_document(self, document_content: str, scenario: KGScenario) -> list: + session_system_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(PROMPT_DICT["extract_knowledge_graph_from_document"]["system"]) + .render(scenario=scenario.get_scenario_all_desc()) + ) + + session = APIBackend().build_chat_session( + session_system_prompt=session_system_prompt, + ) + user_prompt = ( + Environment(undefined=StrictUndefined) + .from_string(PROMPT_DICT["extract_knowledge_graph_from_document"]["user"]) + .render(document_content=document_content) + ) + knowledge_list = [] + for _ in range(10): + response = session.build_chat_completion(user_prompt=user_prompt, json_mode=True) + knowledge = json.loads(response) + knowledge_list.append(knowledge) + user_prompt = "Continue from the last step please. Don't extract the same knowledge again." + return knowledge_list + + def load_from_documents(self, documents: List[str], scenario: KGScenario): + knowledge_list_list = multiprocessing_wrapper( + [ + ( + self.analyze_one_document, + ( + document_content, + scenario, + ), + ) + for document_content in documents + ], + n=RD_AGENT_SETTINGS.multi_proc_n, + ) + node_pairs = [] + node_list = [] + for knowledge_list in tqdm(knowledge_list_list): + for knowledge in knowledge_list: + if knowledge == {}: + break + competition = knowledge.get("competition", "") + + competition_node = UndirectedNode( + content=( + "General knowledge not related to any competition" + if (competition == "" or competition == "N/A") + else competition + ), + label="competition", + ) + node_list.append(competition_node) + + for action in ["hypothesis", "experiments", "code", "conclusion"]: + if action == "hypothesis": + if isinstance(knowledge.get("hypothesis", ""), str) and knowledge.get("hypothesis", "") in [ + "N/A", + "", + ]: + break + label = knowledge[action]["type"] + else: + label = action + content = str(knowledge.get(action, "")) + if content == "" or content == "N/A": + continue + node = UndirectedNode(content=content, label=label) + node_list.append(node) + node_pairs.append((node, competition_node)) + + node_list = self.batch_embedding(node_list) + for node_pair in node_pairs: + self.add_node(node_pair[0], node_pair[1]) diff --git a/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml b/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml index e7daedc05..7f04a69fe 100644 --- a/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml +++ b/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml @@ -36,4 +36,45 @@ extract_kaggle_knowledge_from_feedback_prompts: } user: |- - Experiment strategy: {{ experiment_strategy }} \ No newline at end of file + Experiment strategy: {{ experiment_strategy }} + + +extract_knowledge_graph_from_document: + system: |- + You are helping user to extract knowledge from a document. + The user is working on data science competitions in Kaggle in the following scenario: + {{ scenario }} + + The user has found some possible high value documents from other experts, and they need your help to extract some knowledge from these documents. + + Considering each document might contain several valuable insights, you need to extract them one by one and organize them in a structured format. + + You should return a dict containing a single knowledge which includes several fields: + 1. The competition the document is related to. + 2. The hypothesis the document is trying to prove. Containing a type to the hypothesis and very detailed explanation to the hypothesis. The type should be one from ["Feature engineering", "Feature processing", "Model feature selection", "Model tuning"]. + 3. Detailed experiments the document has conducted. + 4. Any related code snippets related to the hypothesis if available. + 5. The conclusion to this knowledge. A bool value indicating whether the hypothesis is proved or not is required. More explainable conclusion is also needed. + + Please provide the analysis in the following JSON format: + { + "competition": "(Plain text) extracted competition information, including the competition name, type, description, target, and features", + "hypothesis": + { + "type": "one of the hypothesis types from ['Feature engineering', 'Feature processing', 'Model feature selection', 'Model tuning']", + "explanation": "(Plain text) extracted detailed explanation to the hypothesis" + }, + "experiments": "(Plain text) extracted experiments details. You can list them in bullet points.", + "code": "extracted code snippets if available", + "conclusion": + { + "proved": "bool value indicating whether the hypothesis is proved or not", + "explanation": "(Plain text) extracted detailed explanation to the conclusion" + } + } + All fields are required so don't miss any key in the schema. The document might not contain all the fields, so you should extract as much information as possible. If a field is not available, please put "N/A" in the field. + + If you find no valuable insights in the document, please return an empty dict. + + user: |- + Document content: {{ document_content }} \ No newline at end of file diff --git a/rdagent/scenarios/kaggle/knowledge_management/vector_base.py b/rdagent/scenarios/kaggle/knowledge_management/vector_base.py index 4b6514dfd..4ad55c3f1 100644 --- a/rdagent/scenarios/kaggle/knowledge_management/vector_base.py +++ b/rdagent/scenarios/kaggle/knowledge_management/vector_base.py @@ -1,15 +1,10 @@ -import uuid from pathlib import Path -from typing import List, Tuple, Union +from typing import List, Union import pandas as pd from _pytest.cacheprovider import json -from scipy.spatial.distance import cosine -from rdagent.components.knowledge_management.vector_base import ( - KnowledgeMetaData, - PDVectorBase, -) +from rdagent.components.knowledge_management.vector_base import Document, PDVectorBase from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import ( @@ -17,7 +12,7 @@ ) -class KGKnowledgeMetaData(KnowledgeMetaData): +class KGKnowledgeDocument(Document): """ Class for handling Kaggle competition specific metadata """ @@ -104,7 +99,7 @@ def __repr__(self): ) -KGDocument = KGKnowledgeMetaData +KGDocument = KGKnowledgeDocument class KaggleExperienceBase(PDVectorBase): @@ -112,7 +107,7 @@ class KaggleExperienceBase(PDVectorBase): Class for handling Kaggle competition experience posts and organizing them for reference """ - def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None): + def __init__(self, path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None): """ Initialize the KaggleExperienceBase class @@ -123,7 +118,7 @@ def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_pa kaggle_experience_path: str or Path, optional Path to the Kaggle experience post data. """ - super().__init__(vector_df_path) + super().__init__(path) self.kaggle_experience_path = kaggle_experience_path self.kaggle_experience_data = [] @@ -193,7 +188,7 @@ def add_experience_to_vector_base(self, experiment_feedback=None): if experiment_feedback: extracted_knowledge = extract_knowledge_from_feedback(experiment_feedback) - document = KGKnowledgeMetaData( + document = KGKnowledgeDocument( content=experiment_feedback.get("hypothesis_text", ""), label="Experiment Feedback", competition_name="Experiment Result", @@ -216,7 +211,7 @@ def add_experience_to_vector_base(self, experiment_feedback=None): ranking = experience.get("ranking", None) score = experience.get("score", None) - document = KGKnowledgeMetaData( + document = KGKnowledgeDocument( content=content, label=label, competition_name=competition_name, @@ -250,7 +245,7 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f kaggle_docs = [] for result in search_results: - kg_doc = KGKnowledgeMetaData().from_dict(result.__dict__) + kg_doc = KGKnowledgeDocument().from_dict(result.__dict__) kaggle_docs.append(kg_doc) return kaggle_docs, similarities @@ -263,7 +258,7 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f kaggle_base.add_experience_to_vector_base() - kaggle_base.save("git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl") + kaggle_base.save() print(f"There are {kaggle_base.shape()[0]} records in the vector base.") diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml index 97feb2c8e..c32cc25cd 100644 --- a/rdagent/scenarios/kaggle/prompts.yaml +++ b/rdagent/scenarios/kaggle/prompts.yaml @@ -1,3 +1,30 @@ +KG_hypothesis_gen_RAG: |- + The user has proposed several hypothesis and conducted experiments to validate them. + The hypothesis can divided into two categories: + 1. Insights: These are the observations user did to other similar problems. You can either apply the same hypothesis or modify them to fit the current problem. + 2. Experience: These are former hypothesis and experiments user did to the current problem. You can either continue to improve the hypothesis or change to a new one. + + {% if insights %} + The insights are as follows: + {% for insight in insights %} + Insight: {{ loop.index }} + - hypothesis: {{ insight.hypothesis }} + - experiments: {{ insight.experiments }} + - conclusion: {{ insight.conclusion }} + {% endfor %} + {% endif %} + + {% if experiences %} + The experiences are as follows: + {% for experience in experiences %} + Experience: {{ loop.index }} + - hypothesis: {{ experience.hypothesis }} + - experiments: {{ experience.experiments }} + - conclusion: {{ experience.conclusion }} + {% endfor %} + {% endif %} + + hypothesis_and_feedback: |- {% for hypothesis, experiment, feedback in trace.hist %} Hypothesis {{ loop.index }}: {{ hypothesis }} diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index ab0359a76..b0abe9be2 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -19,6 +19,8 @@ KGFactorExperiment, KGModelExperiment, ) +from rdagent.scenarios.kaggle.experiment.scenario import KGScenario +from rdagent.scenarios.kaggle.knowledge_management.graph import KGKnowledgeGraph from rdagent.scenarios.kaggle.knowledge_management.vector_base import ( KaggleExperienceBase, ) @@ -77,9 +79,88 @@ class KGHypothesisGen(ModelHypothesisGen): prompts: Prompts = a_specifc_prompt_dict """ - def __init__(self, scen: Scenario, knowledge: VectorBase = None) -> Tuple[dict, bool]: + def __init__(self, scen: Scenario) -> Tuple[dict, bool]: super().__init__(scen) - self.scen.vector_base.save(KAGGLE_IMPLEMENT_SETTING.rag_path) + + def generate_RAG_content(self, trace: Trace) -> str: + if trace.knowledge_base is None: + return None + same_competition_node = trace.knowledge_base.get_node_by_content(trace.scen.get_competition_full_desc()) + if same_competition_node is not None: + related_hypothesis_nodes = [] + for action in KG_ACTION_LIST: + related_hypothesis_nodes.extend( + trace.knowledge_base.get_nodes_within_steps( + start_node=same_competition_node, + steps=1, + constraint_labels=[action], + )[:1] + ) + else: + related_hypothesis_nodes = [] + experiences = [] + for hypothesis_node in related_hypothesis_nodes: + experience = {"hypothesis": hypothesis_node.content} + experiment_node_list = trace.knowledge_base.get_nodes_within_steps( + start_node=hypothesis_node, steps=1, constraint_labels=["experiments"] + ) + if len(experiment_node_list) > 0: + experience["experiments"] = experiment_node_list[0].content + else: + experience["experiments"] = "No experiment information available." + conclusion_node_list = trace.knowledge_base.get_nodes_within_steps( + start_node=hypothesis_node, steps=1, constraint_labels=["conclusion"] + ) + if len(conclusion_node_list) > 0: + experience["conclusion"] = conclusion_node_list[0].content + else: + experience["conclusion"] = "No conclusion information available." + experiences.append(experience) + + similar_nodes = trace.knowledge_base.semantic_search( + node=trace.scen.get_competition_full_desc(), + topk_k=2, + ) + + found_hypothesis_nodes = [] + for similar_node in similar_nodes: + for hypothesis_type in KG_ACTION_LIST: + hypothesis_nodes = trace.knowledge_base.get_nodes_within_steps( + start_node=similar_node, + steps=3, + constraint_labels=[hypothesis_type], + ) + found_hypothesis_nodes.extend(hypothesis_nodes[:2]) + + found_hypothesis_nodes = sorted(list(set(found_hypothesis_nodes)), key=lambda x: len(x.content)) + + insights = [] + for hypothesis_node in found_hypothesis_nodes[:5]: + if hypothesis_node in related_hypothesis_nodes: + continue + insight = {"hypothesis": hypothesis_node.content} + experiment_node_list = trace.knowledge_base.get_nodes_within_steps( + start_node=hypothesis_node, steps=1, constraint_labels=["experiments"] + ) + if len(experiment_node_list) > 0: + insight["experiments"] = experiment_node_list[0].content + else: + insight["experiments"] = "No experiment information available." + conclusion_node_list = trace.knowledge_base.get_nodes_within_steps( + start_node=hypothesis_node, steps=1, constraint_labels=["conclusion"] + ) + if len(conclusion_node_list) > 0: + insight["conclusion"] = conclusion_node_list[0].content + else: + insight["conclusion"] = "No conclusion information available." + insights.append(insight) + + RAG_content = ( + Environment(undefined=StrictUndefined) + .from_string(prompt_dict["KG_hypothesis_gen_RAG"]) + .render(insights=insights, experiences=experiences) + ) + return RAG_content def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: hypothesis_and_feedback = ( @@ -92,12 +173,9 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: else "No previous hypothesis and feedback available since it's the first round." ) - rag_results, _ = self.scen.vector_base.search_experience(hypothesis_and_feedback, topk_k=5) - rag_content = "\n".join([doc.content for doc in rag_results]) - context_dict = { "hypothesis_and_feedback": hypothesis_and_feedback, - "RAG": None, + "RAG": self.generate_RAG_content(trace), "hypothesis_output_format": prompt_dict["hypothesis_output_format"], "hypothesis_specification": None, } @@ -197,3 +275,7 @@ def convert_response(self, response: str, trace: Trace) -> ModelExperiment: return self.convert_feature_experiment(response, trace) elif self.current_action in [KG_ACTION_MODEL_FEATURE_SELECTION, KG_ACTION_MODEL_TUNING]: return self.convert_model_experiment(response, trace) + + +class KGTrace(Trace[KGScenario, KGKnowledgeGraph]): + pass