From fd3c0fd26eff7d3be72fa4f2a234e33b9f796627 Mon Sep 17 00:00:00 2001
From: Xu Yang <peteryang@vip.qq.com>
Date: Mon, 23 Sep 2024 19:32:01 +0800
Subject: [PATCH] feat:  Initial version if Graph RAG in KAGGLE scenario (#301)

* Initial version if Graph RAG in KAGGLE scenario

* fix CI

* fix a small bug

* fix CI

* fix CI

* fix CI
---
 Makefile                                      |   2 +-
 rdagent/app/kaggle/conf.py                    |   7 ++
 rdagent/app/kaggle/loop.py                    |  10 +-
 .../CoSTEER/knowledge_management.py           |  14 ++-
 .../CoSTEER/knowledge_management.py           |  10 +-
 .../components/knowledge_management/graph.py  |  33 +-----
 .../knowledge_management/vector_base.py       |  32 +-----
 rdagent/components/workflow/conf.py           |   2 +
 rdagent/core/evolving_framework.py            |   6 +-
 rdagent/core/knowledge_base.py                |  25 ++++
 rdagent/core/prompts.py                       |   2 +-
 rdagent/core/proposal.py                      |   7 +-
 rdagent/oai/llm_utils.py                      |   2 +-
 .../meta_tpl/fea_share_preprocess.py          |  14 +--
 .../scenarios/kaggle/experiment/prompts.yaml  |   2 +-
 .../scenarios/kaggle/experiment/scenario.py   |  14 ++-
 .../scenarios/kaggle/experiment/workspace.py  |  21 ++--
 .../kaggle/knowledge_management/graph.py      | 108 ++++++++++++++++++
 .../kaggle/knowledge_management/prompts.yaml  |  43 ++++++-
 .../knowledge_management/vector_base.py       |  25 ++--
 rdagent/scenarios/kaggle/prompts.yaml         |  27 +++++
 rdagent/scenarios/kaggle/proposal/proposal.py |  94 ++++++++++++++-
 22 files changed, 382 insertions(+), 118 deletions(-)
 create mode 100644 rdagent/core/knowledge_base.py
 create mode 100644 rdagent/scenarios/kaggle/knowledge_management/graph.py

diff --git a/Makefile b/Makefile
index 950ec5e75..11ad028d5 100644
--- a/Makefile
+++ b/Makefile
@@ -97,7 +97,7 @@ mypy:
 # First deal with the core folder, and then gradually increase the scope of detection,
 # and eventually realize the detection of the complete project.
 ruff:
-	$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002   # --exclude rdagent/scripts,git_ignore_folder
+	$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002,I001   # --exclude rdagent/scripts,git_ignore_folder
 
 # Check lint with toml-sort.
 toml-sort:
diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py
index b1f30fbd5..e9b668a2a 100644
--- a/rdagent/app/kaggle/conf.py
+++ b/rdagent/app/kaggle/conf.py
@@ -16,6 +16,13 @@ class Config:
     scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
     """Scenario class for data mining model"""
 
+    knowledge_base: str = ""  # TODO enable this line to use the knowledge base
+    # knowledge_base: str = "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph"
+    """Knowledge base class"""
+
+    knowledge_base_path: str = "kg_graph.pkl"
+    """Knowledge base path"""
+
     hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
     """Hypothesis generation class"""
 
diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
index b250c343d..11c757720 100644
--- a/rdagent/app/kaggle/loop.py
+++ b/rdagent/app/kaggle/loop.py
@@ -22,6 +22,7 @@
 from rdagent.scenarios.kaggle.proposal.proposal import (
     KG_ACTION_FEATURE_ENGINEERING,
     KG_ACTION_FEATURE_PROCESSING,
+    KGTrace,
 )
 
 
@@ -32,6 +33,13 @@ def __init__(self, PROP_SETTING: BasePropSetting):
             scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
             logger.log_object(scen, tag="scenario")
 
+            knowledge_base = (
+                import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen)
+                if PROP_SETTING.knowledge_base != ""
+                else None
+            )
+            logger.log_object(knowledge_base, tag="knowledge_base")
+
             self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
             logger.log_object(self.hypothesis_gen, tag="hypothesis generator")
 
@@ -50,7 +58,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):
 
             self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
             logger.log_object(self.summarizer, tag="summarizer")
-            self.trace = Trace(scen=scen)
+            self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base)
             super(RDLoop, self).__init__()
 
     @measure_time
diff --git a/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py
index 462c58122..65cc84bac 100644
--- a/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py
+++ b/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py
@@ -22,9 +22,9 @@
 )
 from rdagent.core.evolving_framework import (
     EvolvableSubjects,
+    EvolvingKnowledgeBase,
     EvoStep,
     Knowledge,
-    KnowledgeBase,
     QueriedKnowledge,
     RAGStrategy,
 )
@@ -71,12 +71,13 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
         self.failed_task_info_set = failed_task_info_set
 
 
-class FactorKnowledgeBaseV1(KnowledgeBase):
-    def __init__(self) -> None:
+class FactorKnowledgeBaseV1(EvolvingKnowledgeBase):
+    def __init__(self, path: str | Path = None) -> None:
         self.implementation_trace: dict[str, FactorKnowledge] = dict()
         self.success_task_info_set: set[str] = set()
 
         self.task_to_embedding = dict()
+        super().__init__(path)
 
     def query(self) -> QueriedKnowledge | None:
         """
@@ -746,12 +747,12 @@ def dataset_query(
         return factor_implementation_queried_graph_knowledge
 
 
-class FactorGraphKnowledgeBase(KnowledgeBase):
-    def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> None:
+class FactorGraphKnowledgeBase(EvolvingKnowledgeBase):
+    def __init__(self, init_component_list=None, path: str | Path = None, data_set_knowledge_path=None) -> None:
         """
         Load knowledge, offer brief information of knowledge and common handle interfaces
         """
-        self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl")
+        self.graph: UndirectedGraph = UndirectedGraph(Path.cwd() / "graph.pkl")
         logger.info(f"Knowledge Graph loaded, size={self.graph.size()}")
 
         if init_component_list:
@@ -780,6 +781,7 @@ def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> No
         if data_set_knowledge_path:
             with open(data_set_knowledge_path, "r") as f:
                 self.data_set_knowledge_dict = json.load(f)
+        super().__init__(path)
 
     def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]:
         return self.graph.get_all_nodes_by_label(label)
diff --git a/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py
index bd88eb4c4..c9a266fbd 100644
--- a/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py
+++ b/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py
@@ -1,11 +1,13 @@
+from pathlib import Path
+
 from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS
 from rdagent.components.coder.model_coder.CoSTEER.evaluators import ModelCoderFeedback
 from rdagent.components.coder.model_coder.model import ModelTask
 from rdagent.core.evolving_framework import (
     EvolvableSubjects,
+    EvolvingKnowledgeBase,
     EvoStep,
     Knowledge,
-    KnowledgeBase,
     QueriedKnowledge,
     RAGStrategy,
 )
@@ -49,13 +51,15 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
         self.working_task_to_similar_successful_knowledge_dict = dict()
 
 
-class ModelKnowledgeBase(KnowledgeBase):
-    def __init__(self) -> None:
+class ModelKnowledgeBase(EvolvingKnowledgeBase):
+    def __init__(self, path: str | Path = None) -> None:
         self.implementation_trace: dict[str, ModelKnowledge] = dict()
         self.success_task_info_set: set[str] = set()
 
         self.task_to_embedding = dict()
 
+        super().__init__(path)
+
     def query(self) -> QueriedKnowledge | None:
         """
         Query the knowledge base to get the queried knowledge. So far is handled in RAG strategy.
diff --git a/rdagent/components/knowledge_management/graph.py b/rdagent/components/knowledge_management/graph.py
index c25f84e9a..50b408acc 100644
--- a/rdagent/components/knowledge_management/graph.py
+++ b/rdagent/components/knowledge_management/graph.py
@@ -12,6 +12,7 @@
     VectorBase,
     cosine,
 )
+from rdagent.core.knowledge_base import KnowledgeBase
 from rdagent.oai.llm_utils import APIBackend
 
 Node = KnowledgeMetaData
@@ -47,14 +48,14 @@ def __repr__(self) -> str:
         )
 
 
-class Graph:
+class Graph(KnowledgeBase):
     """
     base Graph class for Knowledge Graph Search
     """
 
     def __init__(self, path: str | Path | None = None) -> None:
-        self.path = path
         self.nodes = {}
+        super().__init__(path=path)
 
     def size(self) -> int:
         return len(self.nodes)
@@ -77,22 +78,6 @@ def find_node(self, content: str, label: str) -> Node | None:
                 return node
         return None
 
-    @classmethod
-    def load(cls: type[Graph], path: str | Path) -> Graph:
-        """use pickle as the default load method"""
-        path = path if isinstance(path, Path) else Path(path)
-        if not path.exists():
-            return cls(path=path)
-
-        with path.open("rb") as f:
-            return pickle.load(f)
-
-    def save(self, path: str | Path) -> None:
-        """use pickle as the default save method"""
-        Path.mkdir(path.parent, exist_ok=True)
-        with path.open("wb") as f:
-            pickle.dump(self, f)
-
     @staticmethod
     def batch_embedding(nodes: list[Node]) -> list[Node]:
         contents = [node.content for node in nodes]
@@ -119,8 +104,8 @@ class UndirectedGraph(Graph):
     """
 
     def __init__(self, path: str | Path | None = None) -> None:
-        super().__init__(path=path)
         self.vector_base: VectorBase = PDVectorBase()
+        super().__init__(path=path)
 
     def __str__(self) -> str:
         return f"UndirectedGraph(nodes={self.nodes})"
@@ -174,16 +159,6 @@ def add_node(
 
             node.add_neighbor(neighbor)
 
-    @classmethod
-    def load(cls: type[UndirectedGraph], path: str | Path) -> UndirectedGraph:
-        """use pickle as the default load method"""
-        path = path if isinstance(path, Path) else Path(path)
-        if not path.exists():
-            return cls(path=path)
-
-        with path.open("rb") as f:
-            return pickle.load(f)
-
     def add_nodes(self, node: UndirectedNode, neighbors: list[UndirectedNode]) -> None:
         if not neighbors:
             self.add_node(node)
diff --git a/rdagent/components/knowledge_management/vector_base.py b/rdagent/components/knowledge_management/vector_base.py
index f5d22c5e3..bb98caa2c 100644
--- a/rdagent/components/knowledge_management/vector_base.py
+++ b/rdagent/components/knowledge_management/vector_base.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from scipy.spatial.distance import cosine
 
+from rdagent.core.knowledge_base import KnowledgeBase
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend
 
@@ -68,14 +69,11 @@ def contents_to_documents(contents: List[str], label: str = None) -> List[Docume
     return docs
 
 
-class VectorBase:
+class VectorBase(KnowledgeBase):
     """
     This class is used for handling vector storage and query
     """
 
-    def __init__(self, vector_df_path: Union[str, Path] = None, **kwargs):
-        pass
-
     def add(self, document: Union[Document, List[Document]]):
         """
         add new node to vector_df
@@ -104,28 +102,15 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
         """
         pass
 
-    def load(self, **kwargs):
-        """load vector_df"""
-
-    def save(self, **kwargs):
-        """save vector_df"""
-
 
 class PDVectorBase(VectorBase):
     """
     Implement of VectorBase using Pandas
     """
 
-    def __init__(self, vector_df_path: Union[str, Path] = None):
-        super().__init__(vector_df_path)
-
-        if vector_df_path:
-            try:
-                self.vector_df = self.load(vector_df_path)
-            except FileNotFoundError:
-                self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
-        else:
-            self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
+    def __init__(self, path: Union[str, Path] = None):
+        self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
+        super().__init__(path)
 
     def shape(self):
         return self.vector_df.shape
@@ -196,10 +181,3 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
         for _, similar_docs in most_similar_docs.iterrows():
             docs.append(Document().from_dict(similar_docs.to_dict()))
         return docs, searched_similarities.to_list()
-
-    def load(self, vector_df_path, **kwargs):
-        vector_df = pd.read_pickle(vector_df_path)
-        return vector_df
-
-    def save(self, vector_df_path, **kwargs):
-        self.vector_df.to_pickle(vector_df_path)
diff --git a/rdagent/components/workflow/conf.py b/rdagent/components/workflow/conf.py
index f4d1f2a3a..77ed0698c 100644
--- a/rdagent/components/workflow/conf.py
+++ b/rdagent/components/workflow/conf.py
@@ -14,6 +14,8 @@ class Config:
     """
 
     scen: str = ""
+    knowledge_base: str = ""
+    knowledge_base_path: str = ""
     hypothesis_gen: str = ""
     hypothesis2experiment: str = ""
     coder: str = ""
diff --git a/rdagent/core/evolving_framework.py b/rdagent/core/evolving_framework.py
index 25f37b4b0..016abad01 100644
--- a/rdagent/core/evolving_framework.py
+++ b/rdagent/core/evolving_framework.py
@@ -5,6 +5,8 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
+from rdagent.core.knowledge_base import KnowledgeBase
+
 if TYPE_CHECKING:
     from rdagent.core.evaluation import Feedback
     from rdagent.core.scenario import Scenario
@@ -18,7 +20,7 @@ class QueriedKnowledge:
     pass
 
 
-class KnowledgeBase(ABC):
+class EvolvingKnowledgeBase(KnowledgeBase):
     @abstractmethod
     def query(
         self,
@@ -78,7 +80,7 @@ def evolve(
 class RAGStrategy(ABC):
     """Retrieval Augmentation Generation Strategy"""
 
-    def __init__(self, knowledgebase: KnowledgeBase) -> None:
+    def __init__(self, knowledgebase: EvolvingKnowledgeBase) -> None:
         self.knowledgebase = knowledgebase
 
     @abstractmethod
diff --git a/rdagent/core/knowledge_base.py b/rdagent/core/knowledge_base.py
new file mode 100644
index 000000000..f4cd9ab51
--- /dev/null
+++ b/rdagent/core/knowledge_base.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+import dill as pickle  # type: ignore[import-untyped]
+
+from rdagent.log import rdagent_logger as logger
+
+
+class KnowledgeBase:
+    def __init__(self, path: str | Path | None = None) -> None:
+        self.path = Path(path) if path else None
+        self.load()
+
+    def load(self) -> None:
+        if self.path is not None and self.path.exists():
+            with self.path.open("rb") as f:
+                self.__dict__.update(
+                    pickle.load(f).__dict__,
+                )  # TODO: because we need to align with init function, we need a less hacky way to do this
+
+    def dump(self) -> None:
+        if self.path is not None:
+            self.path.parent.mkdir(parents=True, exist_ok=True)
+            pickle.dump(self, self.path.open("wb"))
+        else:
+            logger.warning("KnowledgeBase path is not set, dump failed.")
diff --git a/rdagent/core/prompts.py b/rdagent/core/prompts.py
index 02898480a..5fc059597 100644
--- a/rdagent/core/prompts.py
+++ b/rdagent/core/prompts.py
@@ -1,4 +1,4 @@
-from pathlib import Path  # noqa: I001
+from pathlib import Path
 
 import yaml
 
diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py
index 58c044d74..a420bb311 100644
--- a/rdagent/core/proposal.py
+++ b/rdagent/core/proposal.py
@@ -9,6 +9,7 @@
 
 from rdagent.core.evaluation import Feedback
 from rdagent.core.experiment import ASpecificExp, Experiment
+from rdagent.core.knowledge_base import KnowledgeBase
 from rdagent.core.scenario import Scenario
 
 if TYPE_CHECKING:
@@ -83,12 +84,14 @@ def __str__(self) -> str:
 
 
 ASpecificScen = TypeVar("ASpecificScen", bound=Scenario)
+ASpecificKB = TypeVar("ASpecificKB", bound=KnowledgeBase)
 
 
-class Trace(Generic[ASpecificScen]):
-    def __init__(self, scen: ASpecificScen) -> None:
+class Trace(Generic[ASpecificScen, ASpecificKB]):
+    def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None:
         self.scen: ASpecificScen = scen
         self.hist: list[tuple[Hypothesis, Experiment, HypothesisFeedback]] = []
+        self.knowledge_base: ASpecificKB | None = knowledge_base
 
     def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]:
         """Access the last experiment result, sub-task, and the corresponding hypothesis."""
diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py
index 8cf410b61..d5ccd43e9 100644
--- a/rdagent/oai/llm_utils.py
+++ b/rdagent/oai/llm_utils.py
@@ -90,7 +90,7 @@ def __init__(self, cache_location: str) -> None:
         self.cache_location = cache_location
         db_file_exist = Path(cache_location).exists()
         # TODO: sqlite3 does not support multiprocessing.
-        self.conn = sqlite3.connect(cache_location)
+        self.conn = sqlite3.connect(cache_location, timeout=20)
         self.c = self.conn.cursor()
         if not db_file_exist:
             self.c.execute(
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
index 4b4ef2736..a6d31a9c4 100644
--- a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
+++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
@@ -84,13 +84,13 @@ def preprocess_script():
     """
     This method applies the preprocessing steps to the training, validation, and test datasets.
     """
-    if os.path.exists("X_train.pkl"):
-        X_train = pd.read_pickle("X_train.pkl")
-        X_valid = pd.read_pickle("X_valid.pkl")
-        y_train = pd.read_pickle("y_train.pkl")
-        y_valid = pd.read_pickle("y_valid.pkl")
-        X_test = pd.read_pickle("X_test.pkl")
-        passenger_ids = pd.read_pickle("passenger_ids.pkl")
+    if os.path.exists("/kaggle/preprocessed_data/X_train.pkl"):
+        X_train = pd.read_pickle("/kaggle/preprocessed_data/X_train.pkl")
+        X_valid = pd.read_pickle("/kaggle/preprocessed_data/X_valid.pkl")
+        y_train = pd.read_pickle("/kaggle/preprocessed_data/y_train.pkl")
+        y_valid = pd.read_pickle("/kaggle/preprocessed_data/y_valid.pkl")
+        X_test = pd.read_pickle("/kaggle/preprocessed_data/X_test.pkl")
+        passenger_ids = pd.read_pickle("/kaggle/preprocessed_data/passenger_ids.pkl")
 
         return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
     X_train, X_valid, y_train, y_valid = prepreprocess()
diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
index ab3ab1392..dc35ce318 100644
--- a/rdagent/scenarios/kaggle/experiment/prompts.yaml
+++ b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -56,7 +56,7 @@ kg_background: |-
   The user tries to optimize the performance iteratively by employing one of the feature related or model related action items:
   - Feature related:
     - "Feature engineering": The user will design several new tasks and implement several new features. The new feature might only affect the model using all the feature book.
-    - "Feature processing": The user will design a new task to process the feature book like normalization or one hot encoding to improve the model performance.
+    - "Feature processing": The user will design a new task to process the feature book like normalization or one hot encoding to improve the model performance. Any processing with help of a deep model is not included in this task.
   - Model related:
     - "Model feature selection": The user will modify one model to select the most important features from the feature book to improve the model performance.
     - "Model tuning": The user will tune the hyperparameters of XGBoost, RandomForest or LightGBM or build or improve the NN model to improve the model performance.
diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
index c8221ea64..95fd3158e 100644
--- a/rdagent/scenarios/kaggle/experiment/scenario.py
+++ b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -38,11 +38,6 @@ def __init__(self, competition: str) -> None:
 
         self._background = self.background
 
-        # all competitions are based on the same vector base
-        self.vector_base = KaggleExperienceBase()
-        if KAGGLE_IMPLEMENT_SETTING.rag_path and Path(KAGGLE_IMPLEMENT_SETTING.rag_path).exists():
-            self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path)
-
     def _analysis_competition_description(self):
         sys_prompt = (
             Environment(undefined=StrictUndefined)
@@ -69,7 +64,14 @@ def _analysis_competition_description(self):
         self.competition_type = response_json_analysis.get("Competition Type", "No type provided")
         self.competition_description = response_json_analysis.get("Competition Description", "No description provided")
         self.target_description = response_json_analysis.get("Target Description", "No target provided")
-        self.competition_features = self.source_data
+        self.competition_features = response_json_analysis.get("Competition Features", "No features provided")
+
+    def get_competition_full_desc(self) -> str:
+        return f"""Competition Type: {self.competition_type}
+Competition Description: {self.competition_description}
+Target Description: {self.target_description}
+Competition Features: {self.competition_features}
+"""
 
     @property
     def background(self) -> str:
diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
index 30992d44c..e6f023ca2 100644
--- a/rdagent/scenarios/kaggle/experiment/workspace.py
+++ b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
 from rdagent.core.experiment import FBWorkspace
 from rdagent.log import rdagent_logger as logger
 from rdagent.utils.env import KGDockerEnv
@@ -64,22 +65,24 @@ def generate_preprocess_data(
     def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
         logger.info(f"Running the experiment in {self.workspace_path}")
 
-        # link the data to the workspace to speed up the preprocessing
-        source_data_path = Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / KAGGLE_IMPLEMENT_SETTING.competition
-        self.link_all_files_in_folder_to_workspace(source_data_path, self.workspace_path)
-
         kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
         kgde.prepare()
 
+        running_extra_volume = {
+            (
+                Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition
+            ).absolute(): "/kaggle/preprocessed_data"
+        }
+        if KAGGLE_IMPLEMENT_SETTING.competition:
+            running_extra_volume[
+                KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition
+            ] = "/kaggle/input"
+
         execute_log = kgde.run(
             local_path=str(self.workspace_path),
             entry=f"python train.py",
             env=run_env,
-            running_extra_volume=(
-                {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
-                if KAGGLE_IMPLEMENT_SETTING.competition
-                else None
-            ),
+            running_extra_volume=running_extra_volume,
         )
 
         csv_path = self.workspace_path / "submission_score.csv"
diff --git a/rdagent/scenarios/kaggle/knowledge_management/graph.py b/rdagent/scenarios/kaggle/knowledge_management/graph.py
new file mode 100644
index 000000000..3c6d42200
--- /dev/null
+++ b/rdagent/scenarios/kaggle/knowledge_management/graph.py
@@ -0,0 +1,108 @@
+import json
+from pathlib import Path
+from typing import List
+
+from jinja2 import Environment, StrictUndefined
+from tqdm import tqdm
+
+from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+from rdagent.components.knowledge_management.graph import (
+    UndirectedGraph,
+    UndirectedNode,
+)
+from rdagent.core.conf import RD_AGENT_SETTINGS
+from rdagent.core.prompts import Prompts
+from rdagent.core.utils import multiprocessing_wrapper
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.kaggle.experiment.scenario import KGScenario
+
+PROMPT_DICT = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
+
+
+class KGKnowledgeGraph(UndirectedGraph):
+    def __init__(self, path: str | Path | None, scenario: KGScenario) -> None:
+        super().__init__(path)
+        if path is not None and not Path(path).exists():
+            documents = []
+            for file_path in (Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / "domain_knowledge").glob("*.case"):
+                with open(file_path, "r") as f:
+                    documents.append(f.read())
+            self.load_from_documents(documents=documents, scenario=scenario)
+            self.dump()
+            tmp = 1
+
+    def analyze_one_document(self, document_content: str, scenario: KGScenario) -> list:
+        session_system_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(PROMPT_DICT["extract_knowledge_graph_from_document"]["system"])
+            .render(scenario=scenario.get_scenario_all_desc())
+        )
+
+        session = APIBackend().build_chat_session(
+            session_system_prompt=session_system_prompt,
+        )
+        user_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(PROMPT_DICT["extract_knowledge_graph_from_document"]["user"])
+            .render(document_content=document_content)
+        )
+        knowledge_list = []
+        for _ in range(10):
+            response = session.build_chat_completion(user_prompt=user_prompt, json_mode=True)
+            knowledge = json.loads(response)
+            knowledge_list.append(knowledge)
+            user_prompt = "Continue from the last step please. Don't extract the same knowledge again."
+        return knowledge_list
+
+    def load_from_documents(self, documents: List[str], scenario: KGScenario):
+        knowledge_list_list = multiprocessing_wrapper(
+            [
+                (
+                    self.analyze_one_document,
+                    (
+                        document_content,
+                        scenario,
+                    ),
+                )
+                for document_content in documents
+            ],
+            n=RD_AGENT_SETTINGS.multi_proc_n,
+        )
+        node_pairs = []
+        node_list = []
+        for knowledge_list in tqdm(knowledge_list_list):
+            for knowledge in knowledge_list:
+                if knowledge == {}:
+                    break
+                competition = knowledge.get("competition", "")
+
+                competition_node = UndirectedNode(
+                    content=(
+                        "General knowledge not related to any competition"
+                        if (competition == "" or competition == "N/A")
+                        else competition
+                    ),
+                    label="competition",
+                )
+                node_list.append(competition_node)
+
+                for action in ["hypothesis", "experiments", "code", "conclusion"]:
+                    if action == "hypothesis":
+                        if isinstance(knowledge.get("hypothesis", ""), str) and knowledge.get("hypothesis", "") in [
+                            "N/A",
+                            "",
+                        ]:
+                            break
+                        label = knowledge[action]["type"]
+                    else:
+                        label = action
+                    content = str(knowledge.get(action, ""))
+                    if content == "" or content == "N/A":
+                        continue
+                    node = UndirectedNode(content=content, label=label)
+                    node_list.append(node)
+                    node_pairs.append((node, competition_node))
+
+        node_list = self.batch_embedding(node_list)
+        for node_pair in node_pairs:
+            self.add_node(node_pair[0], node_pair[1])
diff --git a/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml b/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml
index e7daedc05..7f04a69fe 100644
--- a/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml
+++ b/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml
@@ -36,4 +36,45 @@ extract_kaggle_knowledge_from_feedback_prompts:
     }
   
   user: |-
-    Experiment strategy: {{ experiment_strategy }}
\ No newline at end of file
+    Experiment strategy: {{ experiment_strategy }}
+
+
+extract_knowledge_graph_from_document:
+  system: |-
+    You are helping user to extract knowledge from a document.
+    The user is working on data science competitions in Kaggle in the following scenario:
+    {{ scenario }}
+
+    The user has found some possible high value documents from other experts, and they need your help to extract some knowledge from these documents.
+
+    Considering each document might contain several valuable insights, you need to extract them one by one and organize them in a structured format.
+
+    You should return a dict containing a single knowledge which includes several fields:
+    1. The competition the document is related to.
+    2. The hypothesis the document is trying to prove. Containing a type to the hypothesis and very detailed explanation to the hypothesis. The type should be one from ["Feature engineering", "Feature processing", "Model feature selection", "Model tuning"].
+    3. Detailed experiments the document has conducted. 
+    4. Any related code snippets related to the hypothesis if available.
+    5. The conclusion to this knowledge. A bool value indicating whether the hypothesis is proved or not is required. More explainable conclusion is also needed.
+
+    Please provide the analysis in the following JSON format:
+    {
+      "competition": "(Plain text) extracted competition information, including the competition name, type, description, target, and features",
+      "hypothesis":
+        {
+          "type": "one of the hypothesis types from ['Feature engineering', 'Feature processing', 'Model feature selection', 'Model tuning']",
+          "explanation": "(Plain text) extracted detailed explanation to the hypothesis"
+        },
+      "experiments": "(Plain text) extracted experiments details. You can list them in bullet points.",
+      "code": "extracted code snippets if available",
+      "conclusion": 
+        {
+          "proved": "bool value indicating whether the hypothesis is proved or not",
+          "explanation": "(Plain text) extracted detailed explanation to the conclusion"
+        }
+    }
+    All fields are required so don't miss any key in the schema. The document might not contain all the fields, so you should extract as much information as possible. If a field is not available, please put "N/A" in the field.
+
+    If you find no valuable insights in the document, please return an empty dict.
+  
+  user: |-
+    Document content: {{ document_content }}
\ No newline at end of file
diff --git a/rdagent/scenarios/kaggle/knowledge_management/vector_base.py b/rdagent/scenarios/kaggle/knowledge_management/vector_base.py
index 4b6514dfd..4ad55c3f1 100644
--- a/rdagent/scenarios/kaggle/knowledge_management/vector_base.py
+++ b/rdagent/scenarios/kaggle/knowledge_management/vector_base.py
@@ -1,15 +1,10 @@
-import uuid
 from pathlib import Path
-from typing import List, Tuple, Union
+from typing import List, Union
 
 import pandas as pd
 from _pytest.cacheprovider import json
-from scipy.spatial.distance import cosine
 
-from rdagent.components.knowledge_management.vector_base import (
-    KnowledgeMetaData,
-    PDVectorBase,
-)
+from rdagent.components.knowledge_management.vector_base import Document, PDVectorBase
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend
 from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
@@ -17,7 +12,7 @@
 )
 
 
-class KGKnowledgeMetaData(KnowledgeMetaData):
+class KGKnowledgeDocument(Document):
     """
     Class for handling Kaggle competition specific metadata
     """
@@ -104,7 +99,7 @@ def __repr__(self):
         )
 
 
-KGDocument = KGKnowledgeMetaData
+KGDocument = KGKnowledgeDocument
 
 
 class KaggleExperienceBase(PDVectorBase):
@@ -112,7 +107,7 @@ class KaggleExperienceBase(PDVectorBase):
     Class for handling Kaggle competition experience posts and organizing them for reference
     """
 
-    def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
+    def __init__(self, path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
         """
         Initialize the KaggleExperienceBase class
 
@@ -123,7 +118,7 @@ def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_pa
         kaggle_experience_path: str or Path, optional
             Path to the Kaggle experience post data.
         """
-        super().__init__(vector_df_path)
+        super().__init__(path)
         self.kaggle_experience_path = kaggle_experience_path
         self.kaggle_experience_data = []
 
@@ -193,7 +188,7 @@ def add_experience_to_vector_base(self, experiment_feedback=None):
         if experiment_feedback:
             extracted_knowledge = extract_knowledge_from_feedback(experiment_feedback)
 
-            document = KGKnowledgeMetaData(
+            document = KGKnowledgeDocument(
                 content=experiment_feedback.get("hypothesis_text", ""),
                 label="Experiment Feedback",
                 competition_name="Experiment Result",
@@ -216,7 +211,7 @@ def add_experience_to_vector_base(self, experiment_feedback=None):
             ranking = experience.get("ranking", None)
             score = experience.get("score", None)
 
-            document = KGKnowledgeMetaData(
+            document = KGKnowledgeDocument(
                 content=content,
                 label=label,
                 competition_name=competition_name,
@@ -250,7 +245,7 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f
 
         kaggle_docs = []
         for result in search_results:
-            kg_doc = KGKnowledgeMetaData().from_dict(result.__dict__)
+            kg_doc = KGKnowledgeDocument().from_dict(result.__dict__)
             kaggle_docs.append(kg_doc)
 
         return kaggle_docs, similarities
@@ -263,7 +258,7 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f
 
     kaggle_base.add_experience_to_vector_base()
 
-    kaggle_base.save("git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl")
+    kaggle_base.save()
 
     print(f"There are {kaggle_base.shape()[0]} records in the vector base.")
 
diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml
index 97feb2c8e..c32cc25cd 100644
--- a/rdagent/scenarios/kaggle/prompts.yaml
+++ b/rdagent/scenarios/kaggle/prompts.yaml
@@ -1,3 +1,30 @@
+KG_hypothesis_gen_RAG: |-
+  The user has proposed several hypothesis and conducted experiments to validate them. 
+  The hypothesis can divided into two categories:
+  1. Insights: These are the observations user did to other similar problems. You can either apply the same hypothesis or modify them to fit the current problem.
+  2. Experience: These are former hypothesis and experiments user did to the current problem. You can either continue to improve the hypothesis or change to a new one.
+  
+  {% if insights %}
+  The insights are as follows:
+  {% for insight in insights %}
+  Insight: {{ loop.index }}
+  - hypothesis: {{ insight.hypothesis }}
+  - experiments: {{ insight.experiments }}
+  - conclusion: {{ insight.conclusion }}
+  {% endfor %}
+  {% endif %}
+
+  {% if experiences %}
+  The experiences are as follows:
+  {% for experience in experiences %}
+  Experience: {{ loop.index }}
+  - hypothesis: {{ experience.hypothesis }}
+  - experiments: {{ experience.experiments }}
+  - conclusion: {{ experience.conclusion }}
+  {% endfor %}
+  {% endif %}
+
+
 hypothesis_and_feedback: |-
   {% for hypothesis, experiment, feedback in trace.hist %}
   Hypothesis {{ loop.index }}: {{ hypothesis }}
diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py
index ab0359a76..b0abe9be2 100644
--- a/rdagent/scenarios/kaggle/proposal/proposal.py
+++ b/rdagent/scenarios/kaggle/proposal/proposal.py
@@ -19,6 +19,8 @@
     KGFactorExperiment,
     KGModelExperiment,
 )
+from rdagent.scenarios.kaggle.experiment.scenario import KGScenario
+from rdagent.scenarios.kaggle.knowledge_management.graph import KGKnowledgeGraph
 from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
     KaggleExperienceBase,
 )
@@ -77,9 +79,88 @@ class KGHypothesisGen(ModelHypothesisGen):
             prompts: Prompts = a_specifc_prompt_dict
     """
 
-    def __init__(self, scen: Scenario, knowledge: VectorBase = None) -> Tuple[dict, bool]:
+    def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
         super().__init__(scen)
-        self.scen.vector_base.save(KAGGLE_IMPLEMENT_SETTING.rag_path)
+
+    def generate_RAG_content(self, trace: Trace) -> str:
+        if trace.knowledge_base is None:
+            return None
+        same_competition_node = trace.knowledge_base.get_node_by_content(trace.scen.get_competition_full_desc())
+        if same_competition_node is not None:
+            related_hypothesis_nodes = []
+            for action in KG_ACTION_LIST:
+                related_hypothesis_nodes.extend(
+                    trace.knowledge_base.get_nodes_within_steps(
+                        start_node=same_competition_node,
+                        steps=1,
+                        constraint_labels=[action],
+                    )[:1]
+                )
+        else:
+            related_hypothesis_nodes = []
+        experiences = []
+        for hypothesis_node in related_hypothesis_nodes:
+            experience = {"hypothesis": hypothesis_node.content}
+            experiment_node_list = trace.knowledge_base.get_nodes_within_steps(
+                start_node=hypothesis_node, steps=1, constraint_labels=["experiments"]
+            )
+            if len(experiment_node_list) > 0:
+                experience["experiments"] = experiment_node_list[0].content
+            else:
+                experience["experiments"] = "No experiment information available."
+            conclusion_node_list = trace.knowledge_base.get_nodes_within_steps(
+                start_node=hypothesis_node, steps=1, constraint_labels=["conclusion"]
+            )
+            if len(conclusion_node_list) > 0:
+                experience["conclusion"] = conclusion_node_list[0].content
+            else:
+                experience["conclusion"] = "No conclusion information available."
+            experiences.append(experience)
+
+        similar_nodes = trace.knowledge_base.semantic_search(
+            node=trace.scen.get_competition_full_desc(),
+            topk_k=2,
+        )
+
+        found_hypothesis_nodes = []
+        for similar_node in similar_nodes:
+            for hypothesis_type in KG_ACTION_LIST:
+                hypothesis_nodes = trace.knowledge_base.get_nodes_within_steps(
+                    start_node=similar_node,
+                    steps=3,
+                    constraint_labels=[hypothesis_type],
+                )
+                found_hypothesis_nodes.extend(hypothesis_nodes[:2])
+
+        found_hypothesis_nodes = sorted(list(set(found_hypothesis_nodes)), key=lambda x: len(x.content))
+
+        insights = []
+        for hypothesis_node in found_hypothesis_nodes[:5]:
+            if hypothesis_node in related_hypothesis_nodes:
+                continue
+            insight = {"hypothesis": hypothesis_node.content}
+            experiment_node_list = trace.knowledge_base.get_nodes_within_steps(
+                start_node=hypothesis_node, steps=1, constraint_labels=["experiments"]
+            )
+            if len(experiment_node_list) > 0:
+                insight["experiments"] = experiment_node_list[0].content
+            else:
+                insight["experiments"] = "No experiment information available."
+            conclusion_node_list = trace.knowledge_base.get_nodes_within_steps(
+                start_node=hypothesis_node, steps=1, constraint_labels=["conclusion"]
+            )
+            if len(conclusion_node_list) > 0:
+                insight["conclusion"] = conclusion_node_list[0].content
+            else:
+                insight["conclusion"] = "No conclusion information available."
+            insights.append(insight)
+
+        RAG_content = (
+            Environment(undefined=StrictUndefined)
+            .from_string(prompt_dict["KG_hypothesis_gen_RAG"])
+            .render(insights=insights, experiences=experiences)
+        )
+        return RAG_content
 
     def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
         hypothesis_and_feedback = (
@@ -92,12 +173,9 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
             else "No previous hypothesis and feedback available since it's the first round."
         )
 
-        rag_results, _ = self.scen.vector_base.search_experience(hypothesis_and_feedback, topk_k=5)
-        rag_content = "\n".join([doc.content for doc in rag_results])
-
         context_dict = {
             "hypothesis_and_feedback": hypothesis_and_feedback,
-            "RAG": None,
+            "RAG": self.generate_RAG_content(trace),
             "hypothesis_output_format": prompt_dict["hypothesis_output_format"],
             "hypothesis_specification": None,
         }
@@ -197,3 +275,7 @@ def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
             return self.convert_feature_experiment(response, trace)
         elif self.current_action in [KG_ACTION_MODEL_FEATURE_SELECTION, KG_ACTION_MODEL_TUNING]:
             return self.convert_model_experiment(response, trace)
+
+
+class KGTrace(Trace[KGScenario, KGKnowledgeGraph]):
+    pass