Skip to content

Commit

Permalink
feat: Initial version if Graph RAG in KAGGLE scenario (#301)
Browse files Browse the repository at this point in the history
* Initial version if Graph RAG in KAGGLE scenario

* fix CI

* fix a small bug

* fix CI

* fix CI

* fix CI
  • Loading branch information
peteryang1 authored Sep 23, 2024
1 parent 4ecf25f commit fd3c0fd
Show file tree
Hide file tree
Showing 22 changed files with 382 additions and 118 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ mypy:
# First deal with the core folder, and then gradually increase the scope of detection,
# and eventually realize the detection of the complete project.
ruff:
$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002 # --exclude rdagent/scripts,git_ignore_folder
$(PIPRUN) ruff check rdagent/core --ignore FBT001,FBT002,I001 # --exclude rdagent/scripts,git_ignore_folder

# Check lint with toml-sort.
toml-sort:
Expand Down
7 changes: 7 additions & 0 deletions rdagent/app/kaggle/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ class Config:
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
"""Scenario class for data mining model"""

knowledge_base: str = "" # TODO enable this line to use the knowledge base
# knowledge_base: str = "rdagent.scenarios.kaggle.knowledge_management.graph.KGKnowledgeGraph"
"""Knowledge base class"""

knowledge_base_path: str = "kg_graph.pkl"
"""Knowledge base path"""

hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
"""Hypothesis generation class"""

Expand Down
10 changes: 9 additions & 1 deletion rdagent/app/kaggle/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from rdagent.scenarios.kaggle.proposal.proposal import (
KG_ACTION_FEATURE_ENGINEERING,
KG_ACTION_FEATURE_PROCESSING,
KGTrace,
)


Expand All @@ -32,6 +33,13 @@ def __init__(self, PROP_SETTING: BasePropSetting):
scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
logger.log_object(scen, tag="scenario")

knowledge_base = (
import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen)
if PROP_SETTING.knowledge_base != ""
else None
)
logger.log_object(knowledge_base, tag="knowledge_base")

self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

Expand All @@ -50,7 +58,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):

self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
logger.log_object(self.summarizer, tag="summarizer")
self.trace = Trace(scen=scen)
self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base)
super(RDLoop, self).__init__()

@measure_time
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
)
from rdagent.core.evolving_framework import (
EvolvableSubjects,
EvolvingKnowledgeBase,
EvoStep,
Knowledge,
KnowledgeBase,
QueriedKnowledge,
RAGStrategy,
)
Expand Down Expand Up @@ -71,12 +71,13 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
self.failed_task_info_set = failed_task_info_set


class FactorKnowledgeBaseV1(KnowledgeBase):
def __init__(self) -> None:
class FactorKnowledgeBaseV1(EvolvingKnowledgeBase):
def __init__(self, path: str | Path = None) -> None:
self.implementation_trace: dict[str, FactorKnowledge] = dict()
self.success_task_info_set: set[str] = set()

self.task_to_embedding = dict()
super().__init__(path)

def query(self) -> QueriedKnowledge | None:
"""
Expand Down Expand Up @@ -746,12 +747,12 @@ def dataset_query(
return factor_implementation_queried_graph_knowledge


class FactorGraphKnowledgeBase(KnowledgeBase):
def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> None:
class FactorGraphKnowledgeBase(EvolvingKnowledgeBase):
def __init__(self, init_component_list=None, path: str | Path = None, data_set_knowledge_path=None) -> None:
"""
Load knowledge, offer brief information of knowledge and common handle interfaces
"""
self.graph: UndirectedGraph = UndirectedGraph.load(Path.cwd() / "graph.pkl")
self.graph: UndirectedGraph = UndirectedGraph(Path.cwd() / "graph.pkl")
logger.info(f"Knowledge Graph loaded, size={self.graph.size()}")

if init_component_list:
Expand Down Expand Up @@ -780,6 +781,7 @@ def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> No
if data_set_knowledge_path:
with open(data_set_knowledge_path, "r") as f:
self.data_set_knowledge_dict = json.load(f)
super().__init__(path)

def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]:
return self.graph.get_all_nodes_by_label(label)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from pathlib import Path

from rdagent.components.coder.model_coder.conf import MODEL_IMPL_SETTINGS
from rdagent.components.coder.model_coder.CoSTEER.evaluators import ModelCoderFeedback
from rdagent.components.coder.model_coder.model import ModelTask
from rdagent.core.evolving_framework import (
EvolvableSubjects,
EvolvingKnowledgeBase,
EvoStep,
Knowledge,
KnowledgeBase,
QueriedKnowledge,
RAGStrategy,
)
Expand Down Expand Up @@ -49,13 +51,15 @@ def __init__(self, success_task_to_knowledge_dict: dict = {}, failed_task_info_s
self.working_task_to_similar_successful_knowledge_dict = dict()


class ModelKnowledgeBase(KnowledgeBase):
def __init__(self) -> None:
class ModelKnowledgeBase(EvolvingKnowledgeBase):
def __init__(self, path: str | Path = None) -> None:
self.implementation_trace: dict[str, ModelKnowledge] = dict()
self.success_task_info_set: set[str] = set()

self.task_to_embedding = dict()

super().__init__(path)

def query(self) -> QueriedKnowledge | None:
"""
Query the knowledge base to get the queried knowledge. So far is handled in RAG strategy.
Expand Down
33 changes: 4 additions & 29 deletions rdagent/components/knowledge_management/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
VectorBase,
cosine,
)
from rdagent.core.knowledge_base import KnowledgeBase
from rdagent.oai.llm_utils import APIBackend

Node = KnowledgeMetaData
Expand Down Expand Up @@ -47,14 +48,14 @@ def __repr__(self) -> str:
)


class Graph:
class Graph(KnowledgeBase):
"""
base Graph class for Knowledge Graph Search
"""

def __init__(self, path: str | Path | None = None) -> None:
self.path = path
self.nodes = {}
super().__init__(path=path)

def size(self) -> int:
return len(self.nodes)
Expand All @@ -77,22 +78,6 @@ def find_node(self, content: str, label: str) -> Node | None:
return node
return None

@classmethod
def load(cls: type[Graph], path: str | Path) -> Graph:
"""use pickle as the default load method"""
path = path if isinstance(path, Path) else Path(path)
if not path.exists():
return cls(path=path)

with path.open("rb") as f:
return pickle.load(f)

def save(self, path: str | Path) -> None:
"""use pickle as the default save method"""
Path.mkdir(path.parent, exist_ok=True)
with path.open("wb") as f:
pickle.dump(self, f)

@staticmethod
def batch_embedding(nodes: list[Node]) -> list[Node]:
contents = [node.content for node in nodes]
Expand All @@ -119,8 +104,8 @@ class UndirectedGraph(Graph):
"""

def __init__(self, path: str | Path | None = None) -> None:
super().__init__(path=path)
self.vector_base: VectorBase = PDVectorBase()
super().__init__(path=path)

def __str__(self) -> str:
return f"UndirectedGraph(nodes={self.nodes})"
Expand Down Expand Up @@ -174,16 +159,6 @@ def add_node(

node.add_neighbor(neighbor)

@classmethod
def load(cls: type[UndirectedGraph], path: str | Path) -> UndirectedGraph:
"""use pickle as the default load method"""
path = path if isinstance(path, Path) else Path(path)
if not path.exists():
return cls(path=path)

with path.open("rb") as f:
return pickle.load(f)

def add_nodes(self, node: UndirectedNode, neighbors: list[UndirectedNode]) -> None:
if not neighbors:
self.add_node(node)
Expand Down
32 changes: 5 additions & 27 deletions rdagent/components/knowledge_management/vector_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd
from scipy.spatial.distance import cosine

from rdagent.core.knowledge_base import KnowledgeBase
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend

Expand Down Expand Up @@ -68,14 +69,11 @@ def contents_to_documents(contents: List[str], label: str = None) -> List[Docume
return docs


class VectorBase:
class VectorBase(KnowledgeBase):
"""
This class is used for handling vector storage and query
"""

def __init__(self, vector_df_path: Union[str, Path] = None, **kwargs):
pass

def add(self, document: Union[Document, List[Document]]):
"""
add new node to vector_df
Expand Down Expand Up @@ -104,28 +102,15 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
"""
pass

def load(self, **kwargs):
"""load vector_df"""

def save(self, **kwargs):
"""save vector_df"""


class PDVectorBase(VectorBase):
"""
Implement of VectorBase using Pandas
"""

def __init__(self, vector_df_path: Union[str, Path] = None):
super().__init__(vector_df_path)

if vector_df_path:
try:
self.vector_df = self.load(vector_df_path)
except FileNotFoundError:
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
else:
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
def __init__(self, path: Union[str, Path] = None):
self.vector_df = pd.DataFrame(columns=["id", "label", "content", "embedding"])
super().__init__(path)

def shape(self):
return self.vector_df.shape
Expand Down Expand Up @@ -196,10 +181,3 @@ def search(self, content: str, topk_k: int = 5, similarity_threshold: float = 0)
for _, similar_docs in most_similar_docs.iterrows():
docs.append(Document().from_dict(similar_docs.to_dict()))
return docs, searched_similarities.to_list()

def load(self, vector_df_path, **kwargs):
vector_df = pd.read_pickle(vector_df_path)
return vector_df

def save(self, vector_df_path, **kwargs):
self.vector_df.to_pickle(vector_df_path)
2 changes: 2 additions & 0 deletions rdagent/components/workflow/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ class Config:
"""

scen: str = ""
knowledge_base: str = ""
knowledge_base_path: str = ""
hypothesis_gen: str = ""
hypothesis2experiment: str = ""
coder: str = ""
Expand Down
6 changes: 4 additions & 2 deletions rdagent/core/evolving_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

from rdagent.core.knowledge_base import KnowledgeBase

if TYPE_CHECKING:
from rdagent.core.evaluation import Feedback
from rdagent.core.scenario import Scenario
Expand All @@ -18,7 +20,7 @@ class QueriedKnowledge:
pass


class KnowledgeBase(ABC):
class EvolvingKnowledgeBase(KnowledgeBase):
@abstractmethod
def query(
self,
Expand Down Expand Up @@ -78,7 +80,7 @@ def evolve(
class RAGStrategy(ABC):
"""Retrieval Augmentation Generation Strategy"""

def __init__(self, knowledgebase: KnowledgeBase) -> None:
def __init__(self, knowledgebase: EvolvingKnowledgeBase) -> None:
self.knowledgebase = knowledgebase

@abstractmethod
Expand Down
25 changes: 25 additions & 0 deletions rdagent/core/knowledge_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from pathlib import Path

import dill as pickle # type: ignore[import-untyped]

from rdagent.log import rdagent_logger as logger


class KnowledgeBase:
def __init__(self, path: str | Path | None = None) -> None:
self.path = Path(path) if path else None
self.load()

def load(self) -> None:
if self.path is not None and self.path.exists():
with self.path.open("rb") as f:
self.__dict__.update(
pickle.load(f).__dict__,
) # TODO: because we need to align with init function, we need a less hacky way to do this

def dump(self) -> None:
if self.path is not None:
self.path.parent.mkdir(parents=True, exist_ok=True)
pickle.dump(self, self.path.open("wb"))
else:
logger.warning("KnowledgeBase path is not set, dump failed.")
2 changes: 1 addition & 1 deletion rdagent/core/prompts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from pathlib import Path # noqa: I001
from pathlib import Path

import yaml

Expand Down
7 changes: 5 additions & 2 deletions rdagent/core/proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from rdagent.core.evaluation import Feedback
from rdagent.core.experiment import ASpecificExp, Experiment
from rdagent.core.knowledge_base import KnowledgeBase
from rdagent.core.scenario import Scenario

if TYPE_CHECKING:
Expand Down Expand Up @@ -83,12 +84,14 @@ def __str__(self) -> str:


ASpecificScen = TypeVar("ASpecificScen", bound=Scenario)
ASpecificKB = TypeVar("ASpecificKB", bound=KnowledgeBase)


class Trace(Generic[ASpecificScen]):
def __init__(self, scen: ASpecificScen) -> None:
class Trace(Generic[ASpecificScen, ASpecificKB]):
def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None:
self.scen: ASpecificScen = scen
self.hist: list[tuple[Hypothesis, Experiment, HypothesisFeedback]] = []
self.knowledge_base: ASpecificKB | None = knowledge_base

def get_sota_hypothesis_and_experiment(self) -> tuple[Hypothesis | None, Experiment | None]:
"""Access the last experiment result, sub-task, and the corresponding hypothesis."""
Expand Down
2 changes: 1 addition & 1 deletion rdagent/oai/llm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def __init__(self, cache_location: str) -> None:
self.cache_location = cache_location
db_file_exist = Path(cache_location).exists()
# TODO: sqlite3 does not support multiprocessing.
self.conn = sqlite3.connect(cache_location)
self.conn = sqlite3.connect(cache_location, timeout=20)
self.c = self.conn.cursor()
if not db_file_exist:
self.c.execute(
Expand Down
Loading

0 comments on commit fd3c0fd

Please sign in to comment.