feat: Initial version if Graph RAG in KAGGLE scenario (#301)

* Initial version if Graph RAG in KAGGLE scenario * fix CI * fix a small bug * fix CI * fix CI * fix CI
microsoft · Sep 23, 2024 · fd3c0fd · fd3c0fd
1 parent 4ecf25f
commit fd3c0fd
Show file tree

Hide file tree

Showing 22 changed files with 382 additions and 118 deletions.
diff --git a/Makefile b/Makefile
@@ -97,7 +97,7 @@ mypy:
 # First deal with the core folder, and then gradually increase the scope of detection,
 # and eventually realize the detection of the complete project.
 ruff:
-	$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002   # --exclude rdagent/scripts,git_ignore_folder
+	$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002,I001   # --exclude rdagent/scripts,git_ignore_folder
 
 # Check lint with toml-sort.
 toml-sort:

diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py
@@ -16,6 +16,13 @@ class Config:
     scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
     """Scenario class for data mining model"""
 
+    knowledge_base: str = ""  # TODO enable this line to use the knowledge base
+    # knowledge_base: str = "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph"
+    """Knowledge base class"""
+
+    knowledge_base_path: str = "kg_graph.pkl"
+    """Knowledge base path"""
+
     hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
     """Hypothesis generation class"""
 

diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
@@ -22,6 +22,7 @@
 from rdagent.scenarios.kaggle.proposal.proposal import (
     KG_ACTION_FEATURE_ENGINEERING,
     KG_ACTION_FEATURE_PROCESSING,
+    KGTrace,
 )
 
 
@@ -32,6 +33,13 @@ def __init__(self, PROP_SETTING: BasePropSetting):
             scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
             logger.log_object(scen, tag="scenario")
 
+            knowledge_base = (
+                import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen)
+                if PROP_SETTING.knowledge_base != ""
+                else None
+            )
+            logger.log_object(knowledge_base, tag="knowledge_base")
+
             self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
             logger.log_object(self.hypothesis_gen, tag="hypothesis generator")
 
@@ -50,7 +58,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):
 
             self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
             logger.log_object(self.summarizer, tag="summarizer")
-            self.trace = Trace(scen=scen)
+            self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base)
             super(RDLoop, self).__init__()
 
     @measure_time

diff --git a/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/factor_coder/CoSTEER/knowledge_management.py
@@ -22,9 +22,9 @@
 )
 from rdagent.core.evolving_framework import (
     EvolvableSubjects,
+    EvolvingKnowledgeBase,
     EvoStep,
     Knowledge,
-    KnowledgeBase,
     QueriedKnowledge,
     RAGStrategy,
 )
@@ -71,12 +71,13 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
         self.failed_task_info_set = failed_task_info_set
 
 
-class FactorKnowledgeBaseV1(KnowledgeBase):
-    def __init__(self) -> None:
+class FactorKnowledgeBaseV1(EvolvingKnowledgeBase):
+    def __init__(self, path: str | Path = None) -> None:
         self.implementation_trace: dict[str, FactorKnowledge] = dict()
         self.success_task_info_set: set[str] = set()
 
         self.task_to_embedding = dict()
+        super().__init__(path)
 
     def query(self) -> QueriedKnowledge | None:
         """
@@ -746,12 +747,12 @@ def dataset_query(
         return factor_implementation_queried_graph_knowledge
 
 
-class FactorGraphKnowledgeBase(KnowledgeBase):
-    def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> None:
+class FactorGraphKnowledgeBase(EvolvingKnowledgeBase):
+    def __init__(self, init_component_list=None, path: str | Path = None, data_set_knowledge_path=None) -> None:
         """
         Load knowledge, offer brief information of knowledge and common handle interfaces
         """
-        self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl")
+        self.graph: UndirectedGraph = UndirectedGraph(Path.cwd() / "graph.pkl")
         logger.info(f"Knowledge Graph loaded, size={self.graph.size()}")
 
         if init_component_list:
@@ -780,6 +781,7 @@ def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> No
         if data_set_knowledge_path:
             with open(data_set_knowledge_path, "r") as f:
                 self.data_set_knowledge_dict = json.load(f)
+        super().__init__(path)
 
     def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]:
         return self.graph.get_all_nodes_by_label(label)

diff --git a/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/model_coder/CoSTEER/knowledge_management.py
@@ -1,11 +1,13 @@
+from pathlib import Path
+
 from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS
 from rdagent.components.coder.model_coder.CoSTEER.evaluators import ModelCoderFeedback
 from rdagent.components.coder.model_coder.model import ModelTask
 from rdagent.core.evolving_framework import (
     EvolvableSubjects,
+    EvolvingKnowledgeBase,
     EvoStep,
     Knowledge,
-    KnowledgeBase,
     QueriedKnowledge,
     RAGStrategy,
 )
@@ -49,13 +51,15 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
         self.working_task_to_similar_successful_knowledge_dict = dict()
 
 
-class ModelKnowledgeBase(KnowledgeBase):
-    def __init__(self) -> None:
+class ModelKnowledgeBase(EvolvingKnowledgeBase):
+    def __init__(self, path: str | Path = None) -> None:
         self.implementation_trace: dict[str, ModelKnowledge] = dict()
         self.success_task_info_set: set[str] = set()
 
         self.task_to_embedding = dict()
 
+        super().__init__(path)
+
     def query(self) -> QueriedKnowledge | None:
         """
         Query the knowledge base to get the queried knowledge. So far is handled in RAG strategy.

diff --git a/rdagent/components/knowledge_management/graph.py b/rdagent/components/knowledge_management/graph.py
@@ -12,6 +12,7 @@
     VectorBase,
     cosine,
 )
+from rdagent.core.knowledge_base import KnowledgeBase
 from rdagent.oai.llm_utils import APIBackend
 
 Node = KnowledgeMetaData
@@ -47,14 +48,14 @@ def __repr__(self) -> str:
         )
 
 
-class Graph:
+class Graph(KnowledgeBase):
     """
     base Graph class for Knowledge Graph Search
     """
 
     def __init__(self, path: str | Path | None = None) -> None:
-        self.path = path
         self.nodes = {}
+        super().__init__(path=path)
 
     def size(self) -> int:
         return len(self.nodes)
@@ -77,22 +78,6 @@ def find_node(self, content: str, label: str) -> Node | None:
                 return node
         return None
 
-    @classmethod
-    def load(cls: type[Graph], path: str | Path) -> Graph:
-        """use pickle as the default load method"""
-        path = path if isinstance(path, Path) else Path(path)
-        if not path.exists():
-            return cls(path=path)
-
-        with path.open("rb") as f:
-            return pickle.load(f)
-
-    def save(self, path: str | Path) -> None:
-        """use pickle as the default save method"""
-        Path.mkdir(path.parent, exist_ok=True)
-        with path.open("wb") as f:
-            pickle.dump(self, f)
-
     @staticmethod
     def batch_embedding(nodes: list[Node]) -> list[Node]:
         contents = [node.content for node in nodes]
@@ -119,8 +104,8 @@ class UndirectedGraph(Graph):
     """
 
     def __init__(self, path: str | Path | None = None) -> None:
-        super().__init__(path=path)
         self.vector_base: VectorBase = PDVectorBase()
+        super().__init__(path=path)
 
     def __str__(self) -> str:
         return f"UndirectedGraph(nodes={self.nodes})"
@@ -174,16 +159,6 @@ def add_node(
 
             node.add_neighbor(neighbor)
 
-    @classmethod
-    def load(cls: type[UndirectedGraph], path: str | Path) -> UndirectedGraph:
-        """use pickle as the default load method"""
-        path = path if isinstance(path, Path) else Path(path)
-        if not path.exists():
-            return cls(path=path)
-
-        with path.open("rb") as f:
-            return pickle.load(f)
-
     def add_nodes(self, node: UndirectedNode, neighbors: list[UndirectedNode]) -> None:
         if not neighbors:
             self.add_node(node)

diff --git a/rdagent/components/knowledge_management/vector_base.py b/rdagent/components/knowledge_management/vector_base.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from scipy.spatial.distance import cosine
 
+from rdagent.core.knowledge_base import KnowledgeBase
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend
 
@@ -68,14 +69,11 @@ def contents_to_documents(contents: List[str], label: str = None) -> List[Docume
     return docs
 
 
-class VectorBase:
+class VectorBase(KnowledgeBase):
     """
     This class is used for handling vector storage and query
     """
 
-    def __init__(self, vector_df_path: Union[str, Path] = None, **kwargs):
-        pass
-
     def add(self, document: Union[Document, List[Document]]):
         """
         add new node to vector_df
@@ -104,28 +102,15 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
         """
         pass
 
-    def load(self, **kwargs):
-        """load vector_df"""
-
-    def save(self, **kwargs):
-        """save vector_df"""
-
 
 class PDVectorBase(VectorBase):
     """
     Implement of VectorBase using Pandas
     """
 
-    def __init__(self, vector_df_path: Union[str, Path] = None):
-        super().__init__(vector_df_path)
-
-        if vector_df_path:
-            try:
-                self.vector_df = self.load(vector_df_path)
-            except FileNotFoundError:
-                self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
-        else:
-            self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
+    def __init__(self, path: Union[str, Path] = None):
+        self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
+        super().__init__(path)
 
     def shape(self):
         return self.vector_df.shape
@@ -196,10 +181,3 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
         for _, similar_docs in most_similar_docs.iterrows():
             docs.append(Document().from_dict(similar_docs.to_dict()))
         return docs, searched_similarities.to_list()
-
-    def load(self, vector_df_path, **kwargs):
-        vector_df = pd.read_pickle(vector_df_path)
-        return vector_df
-
-    def save(self, vector_df_path, **kwargs):
-        self.vector_df.to_pickle(vector_df_path)
diff --git a/rdagent/components/workflow/conf.py b/rdagent/components/workflow/conf.py
@@ -14,6 +14,8 @@ class Config:
     """
 
     scen: str = ""
+    knowledge_base: str = ""
+    knowledge_base_path: str = ""
     hypothesis_gen: str = ""
     hypothesis2experiment: str = ""
     coder: str = ""

diff --git a/rdagent/core/evolving_framework.py b/rdagent/core/evolving_framework.py
@@ -5,6 +5,8 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
+from rdagent.core.knowledge_base import KnowledgeBase
+
 if TYPE_CHECKING:
     from rdagent.core.evaluation import Feedback
     from rdagent.core.scenario import Scenario
@@ -18,7 +20,7 @@ class QueriedKnowledge:
     pass
 
 
-class KnowledgeBase(ABC):
+class EvolvingKnowledgeBase(KnowledgeBase):
     @abstractmethod
     def query(
         self,
@@ -78,7 +80,7 @@ def evolve(
 class RAGStrategy(ABC):
     """Retrieval Augmentation Generation Strategy"""
 
-    def __init__(self, knowledgebase: KnowledgeBase) -> None:
+    def __init__(self, knowledgebase: EvolvingKnowledgeBase) -> None:
         self.knowledgebase = knowledgebase
 
     @abstractmethod

diff --git a/rdagent/core/knowledge_base.py b/rdagent/core/knowledge_base.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+import dill as pickle  # type: ignore[import-untyped]
+
+from rdagent.log import rdagent_logger as logger
+
+
+class KnowledgeBase:
+    def __init__(self, path: str | Path | None = None) -> None:
+        self.path = Path(path) if path else None
+        self.load()
+
+    def load(self) -> None:
+        if self.path is not None and self.path.exists():
+            with self.path.open("rb") as f:
+                self.__dict__.update(
+                    pickle.load(f).__dict__,
+                )  # TODO: because we need to align with init function, we need a less hacky way to do this
+
+    def dump(self) -> None:
+        if self.path is not None:
+            self.path.parent.mkdir(parents=True, exist_ok=True)
+            pickle.dump(self, self.path.open("wb"))
+        else:
+            logger.warning("KnowledgeBase path is not set, dump failed.")
diff --git a/rdagent/core/prompts.py b/rdagent/core/prompts.py
@@ -1,4 +1,4 @@
-from pathlib import Path  # noqa: I001
+from pathlib import Path
 
 import yaml
 

diff --git a/rdagent/core/proposal.py b/rdagent/core/proposal.py
@@ -9,6 +9,7 @@
 
 from rdagent.core.evaluation import Feedback
 from rdagent.core.experiment import ASpecificExp, Experiment
+from rdagent.core.knowledge_base import KnowledgeBase
 from rdagent.core.scenario import Scenario
 
 if TYPE_CHECKING:
@@ -83,12 +84,14 @@ def __str__(self) -> str:
 
 
 ASpecificScen = TypeVar("ASpecificScen", bound=Scenario)
+ASpecificKB = TypeVar("ASpecificKB", bound=KnowledgeBase)
 
 
-class Trace(Generic[ASpecificScen]):
-    def __init__(self, scen: ASpecificScen) -> None:
+class Trace(Generic[ASpecificScen, ASpecificKB]):
+    def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None:
         self.scen: ASpecificScen = scen
         self.hist: list[tuple[Hypothesis, Experiment, HypothesisFeedback]] = []
+        self.knowledge_base: ASpecificKB | None = knowledge_base
 
     def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]:
         """Access the last experiment result, sub-task, and the corresponding hypothesis."""

diff --git a/rdagent/oai/llm_utils.py b/rdagent/oai/llm_utils.py
@@ -90,7 +90,7 @@ def __init__(self, cache_location: str) -> None:
         self.cache_location = cache_location
         db_file_exist = Path(cache_location).exists()
         # TODO: sqlite3 does not support multiprocessing.
-        self.conn = sqlite3.connect(cache_location)
+        self.conn = sqlite3.connect(cache_location, timeout=20)
         self.c = self.conn.cursor()
         if not db_file_exist:
             self.c.execute(