Commit

wip: add feature tables (#842)
* wip: add feature tables

* wip: add functioning tests

* fix tests

* run black

* remove commented lines

* use db for ingestion, and remove extra imports

* add crud interface for pipeline

* update openapi

* style with black

* pacify flake8
jdkent authored Jan 17, 2025
1 parent 71d9f1d commit b3788e3
Showing 13 changed files with 846 additions and 6 deletions.
107 changes: 107 additions & 0 deletions store/neurostore/ingest/extracted_features.py
@@ -0,0 +1,107 @@
"""Ingest extracted features into the database."""

import json
import os.path as op
from pathlib import Path
import hashlib
from dateutil.parser import parse as parse_date

from neurostore.database import db
from neurostore.models import (
Pipeline,
PipelineConfig,
PipelineRun,
PipelineRunResult,
)


def ingest_feature(feature_directory):
"""Ingest demographics data into the database."""
# read pipeline_info.json from the base feature directory
with open(op.join(feature_directory, "pipeline_info.json")) as f:
pipeline_info = json.load(f)

    # check whether a pipeline with the same name and version already exists
pipeline = (
db.session.query(Pipeline)
.filter(
Pipeline.name == pipeline_info["name"],
Pipeline.version == pipeline_info["version"],
)
.first()
)
# create a pipeline if it does not exist
if not pipeline:
pipeline = Pipeline(
name=pipeline_info["name"],
version=pipeline_info["version"],
description=pipeline_info.get("description"),
            study_dependent=pipeline_info.get("type") == "dependent",
ace_compatible="ace"
in pipeline_info.get("arguments", {}).get("input_sources", []),
pubget_compatible="pubget"
in pipeline_info.get("arguments", {}).get("input_sources", []),
derived_from=pipeline_info.get("derived_from", None),
)
db.session.add(pipeline)

    # look for an existing config for this pipeline that matches the
    # "arguments" field in pipeline_info.json
# create a hash of the config arguments
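    # (json.dumps is key-order sensitive, so logically identical configs can
    # hash differently; passing sort_keys=True would make the hash canonical)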
config_hash = hashlib.sha256(
json.dumps(pipeline_info["arguments"]).encode()
).hexdigest()
pipeline_config = (
db.session.query(PipelineConfig)
.filter(
PipelineConfig.pipeline_id == pipeline.id,
PipelineConfig.config_hash == config_hash,
)
.first()
)
# create a pipeline config if it does not exist
if not pipeline_config:
pipeline_config = PipelineConfig(
pipeline_id=pipeline.id,
config=pipeline_info["arguments"],
config_hash=config_hash,
)
db.session.add(pipeline_config)
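        # note: pipeline_config.id may still be unpopulated here if the config
        # was just created and the session has not flushed; an explicit
        # db.session.flush() before building the run would guarantee it is set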

# create a new pipeline run
pipeline_run = PipelineRun(
pipeline_id=pipeline.id,
config_id=pipeline_config.id,
)

# get a list of all the paper directories in the feature directory
paper_dirs = [d for d in Path(feature_directory).iterdir() if d.is_dir()]

    # for each paper directory, read the results.json and info.json files
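    # each paper directory is expected to contain both files; a missing
    # results.json or info.json raises FileNotFoundError and aborts the ingest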
pipeline_run_results = []
for paper_dir in paper_dirs:
with open(op.join(paper_dir, "results.json")) as f:
results = json.load(f)

with open(op.join(paper_dir, "info.json")) as f:
info = json.load(f)

# use the directory name as the base_study_id
base_study_id = paper_dir.name
# create a new result record
pipeline_run_results.append(
PipelineRunResult(
base_study_id=base_study_id,
data=results,
date_executed=parse_date(info["date"]),
file_inputs=info["inputs"],
run=pipeline_run,
)
)

db.session.add(pipeline_run)
db.session.add_all(pipeline_run_results)

db.session.commit()
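
For reference, a minimal sketch of the layout ingest_feature expects, inferred from the reads above (every name except pipeline_info.json, results.json, and info.json is illustrative):

feature_directory/
├── pipeline_info.json   # {"name": ..., "version": ..., "arguments": {...}}
├── <base_study_id>/
│   ├── results.json     # stored verbatim in PipelineRunResult.data
│   └── info.json        # {"date": ..., "inputs": ...}
└── ...

from neurostore.ingest.extracted_features import ingest_feature

ingest_feature("/path/to/feature_directory")  # illustrative path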
10 changes: 10 additions & 0 deletions store/neurostore/models/__init__.py
@@ -12,6 +12,11 @@
AnnotationAnalysis,
PointValue,
AnalysisConditions,
Pipeline,
PipelineConfig,
PipelineRun,
PipelineRunResult,
PipelineRunResultVote,
)
from .auth import User, Role

@@ -31,4 +36,9 @@
"AnalysisConditions",
"User",
"Role",
"Pipeline",
"PipelineConfig",
"PipelineRun",
"PipelineRunResult",
"PipelineRunResultVote",
]
70 changes: 70 additions & 0 deletions store/neurostore/models/data.py
@@ -279,6 +279,7 @@ class Study(BaseMixin, db.Model):
level = db.Column(db.String)
metadata_ = db.Column(JSONB)
source = db.Column(db.String, index=True)
source_id = db.Column(db.String, index=True)
source_updated_at = db.Column(db.DateTime(timezone=True))
base_study_id = db.Column(db.Text, db.ForeignKey("base_studies.id"), index=True)
@@ -538,6 +539,75 @@ class PointValue(BaseMixin, db.Model):
user = relationship("User", backref=backref("point_values", passive_deletes=True))


class Pipeline(BaseMixin, db.Model):
__tablename__ = "pipelines"

name = db.Column(db.String)
description = db.Column(db.String)
version = db.Column(db.String)
study_dependent = db.Column(db.Boolean, default=False)
ace_compatible = db.Column(db.Boolean, default=False)
pubget_compatible = db.Column(db.Boolean, default=False)
derived_from = db.Column(db.Text)


class PipelineConfig(BaseMixin, db.Model):
__tablename__ = "pipeline_configs"

pipeline_id = db.Column(
db.Text, db.ForeignKey("pipelines.id", ondelete="CASCADE"), index=True
)
config = db.Column(JSONB)
config_hash = db.Column(db.String, index=True)
pipeline = relationship(
"Pipeline", backref=backref("configs", passive_deletes=True)
)


class PipelineRun(BaseMixin, db.Model):
__tablename__ = "pipeline_runs"

pipeline_id = db.Column(
db.Text, db.ForeignKey("pipelines.id", ondelete="CASCADE"), index=True
)
config_id = db.Column(
db.Text, db.ForeignKey("pipeline_configs.id", ondelete="CASCADE"), index=True
)
config = relationship(
"PipelineConfig", backref=backref("runs", passive_deletes=True)
)
run_index = db.Column(db.Integer())


class PipelineRunResult(BaseMixin, db.Model):
__tablename__ = "pipeline_run_results"

run_id = db.Column(
db.Text, db.ForeignKey("pipeline_runs.id", ondelete="CASCADE"), index=True
)
base_study_id = db.Column(db.Text, db.ForeignKey("base_studies.id"), index=True)
date_executed = db.Column(db.DateTime(timezone=True))
data = db.Column(JSONB)
file_inputs = db.Column(JSONB)
run = relationship("PipelineRun", backref=backref("results", passive_deletes=True))


class PipelineRunResultVote(BaseMixin, db.Model):
__tablename__ = "pipeline_run_result_votes"

run_result_id = db.Column(
db.Text,
db.ForeignKey("pipeline_run_results.id", ondelete="CASCADE"),
index=True,
)
user_id = db.Column(db.Text, db.ForeignKey("users.external_id"), index=True)
accurate = db.Column(db.Boolean)
run_result = relationship(
"PipelineRunResult", backref=backref("votes", passive_deletes=True)
)
user = relationship("User", backref=backref("votes", passive_deletes=True))


# from . import event_listeners # noqa E402

# del event_listeners
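
A usage sketch, not part of the commit: the new tables chain pipeline → config → run → result, so results for a given pipeline version are reachable with ordinary joins (the name and version values here are illustrative):

results = (
    db.session.query(PipelineRunResult)
    .join(PipelineRun, PipelineRunResult.run_id == PipelineRun.id)
    .join(Pipeline, PipelineRun.pipeline_id == Pipeline.id)
    .filter(Pipeline.name == "example-pipeline", Pipeline.version == "1.0.0")
    .all()
)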
2 changes: 1 addition & 1 deletion store/neurostore/openapi
Submodule openapi updated 1 file
+471 −0 neurostore-openapi.yml
13 changes: 13 additions & 0 deletions store/neurostore/resources/__init__.py
@@ -15,6 +15,14 @@
UsersView,
)

from .pipeline import (
PipelinesView,
PipelineConfigsView,
PipelineRunsView,
PipelineRunResultsView,
PipelineRunResultVotesView,
)

__all__ = [
"StudysetsView",
"AnnotationsView",
@@ -27,4 +35,9 @@
"PointsView",
"PointValuesView",
"UsersView",
"PipelinesView",
"PipelineConfigsView",
"PipelineRunsView",
"PipelineRunResultsView",
"PipelineRunResultVotesView",
]
2 changes: 1 addition & 1 deletion store/neurostore/resources/base.py
@@ -619,7 +619,7 @@ def search(self):
validate_search_query(s)
except errors.SyntaxError as e:
abort(400, description=e.args[0])
tsquery = func.to_tsquery('english', pubmed_to_tsquery(s))
tsquery = func.to_tsquery("english", pubmed_to_tsquery(s))
q = q.filter(m._ts_vector.op("@@")(tsquery))

# Alternatively (or in addition), search on individual fields.
