Skip to content

Commit

Permalink
remove commented lines
Browse files Browse the repository at this point in the history
  • Loading branch information
jdkent committed Jan 13, 2025
1 parent 3796e66 commit f14f343
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 4 deletions.
110 changes: 110 additions & 0 deletions store/neurostore/ingest/extracted_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Ingest extracted features into the database."""

import json
import os.path as op
import re
from pathlib import Path
import hashlib

import numpy as np
import pandas as pd
import requests
from scipy import sparse
from dateutil.parser import parse as parse_date
from sqlalchemy import or_

from neurostore.database import db
from neurostore.models import (
Pipeline,
PipelineConfig,
PipelineRun,
PipelineRunResult,
)


def ingest_feature(feature_directory, session):
    """Ingest extracted pipeline features into the database.

    Reads ``pipeline_info.json`` from the base of *feature_directory* to
    find-or-create the matching ``Pipeline`` and ``PipelineConfig``
    records, then creates a single ``PipelineRun`` with one
    ``PipelineRunResult`` per paper sub-directory.

    Parameters
    ----------
    feature_directory : str or pathlib.Path
        Directory containing ``pipeline_info.json`` plus one
        sub-directory per paper; each paper directory holds
        ``results.json`` (the extracted features) and ``info.json``
        (provenance: execution date and input files).
    session : sqlalchemy.orm.Session
        Database session used for lookups and inserts; committed at the
        end of the call.
    """
    feature_directory = Path(feature_directory)

    # read pipeline_info.json from the base feature directory
    with open(feature_directory / "pipeline_info.json") as f:
        pipeline_info = json.load(f)

    # search if there is an existing pipeline with the same name and version
    pipeline = (
        session.query(Pipeline)
        .filter(
            Pipeline.name == pipeline_info["name"],
            Pipeline.version == pipeline_info["version"],
        )
        .first()
    )
    # create a pipeline if it does not exist
    if not pipeline:
        pipeline = Pipeline(
            name=pipeline_info["name"],
            version=pipeline_info["version"],
            description=pipeline_info.get("description"),
            study_dependent=pipeline_info.get("study_dependent", False),
            ace_compatible=pipeline_info.get("ace_compatible", False),
            pubget_compatible=pipeline_info.get("pubget_compatible", False),
            derived_from=pipeline_info.get("derived_from", None),
        )
        session.add(pipeline)
        # flush so the new pipeline is assigned its primary key before
        # it is referenced by PipelineConfig below
        session.flush()

    # search within the pipeline and see if there are any existing pipeline
    # configs that match the "arguments" field in the pipeline_info.json.
    # sort_keys makes the hash independent of key ordering in the JSON
    # file, so semantically identical configs deduplicate correctly.
    config_hash = hashlib.sha256(
        json.dumps(pipeline_info["arguments"], sort_keys=True).encode()
    ).hexdigest()
    pipeline_config = (
        session.query(PipelineConfig)
        .filter(
            PipelineConfig.pipeline_id == pipeline.id,
            PipelineConfig.config_hash == config_hash,
        )
        .first()
    )
    # create a pipeline config if it does not exist
    if not pipeline_config:
        pipeline_config = PipelineConfig(
            pipeline_id=pipeline.id,
            config=pipeline_info["arguments"],
            config_hash=config_hash,
        )
        session.add(pipeline_config)
        # flush so config_id is populated before the PipelineRun uses it
        session.flush()

    # create a new pipeline run
    pipeline_run = PipelineRun(
        pipeline_id=pipeline.id,
        config_id=pipeline_config.id,
    )

    # each sub-directory of the feature directory is one paper
    paper_dirs = [d for d in feature_directory.iterdir() if d.is_dir()]

    # for each paper directory, read the results.json and info.json files
    pipeline_run_results = []
    for paper_dir in paper_dirs:
        with open(paper_dir / "results.json") as f:
            results = json.load(f)

        with open(paper_dir / "info.json") as f:
            info = json.load(f)

        # use the directory name as the base_study_id
        pipeline_run_results.append(
            PipelineRunResult(
                base_study_id=paper_dir.name,
                data=results,
                date_executed=parse_date(info["date"]),
                file_inputs=info["inputs"],
                run=pipeline_run,
            )
        )

    session.add(pipeline_run)
    session.add_all(pipeline_run_results)

    session.commit()
4 changes: 0 additions & 4 deletions store/neurostore/models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,10 +589,6 @@ class PipelineRunResult(BaseMixin, db.Model):
date_executed = db.Column(db.DateTime(timezone=True))
data = db.Column(JSONB)
file_inputs = db.Column(JSONB)
# feature_index = db.Column(db.Integer) # the same categories of information can be extracted multiple times from a single paper (e.g., multiple demographic groups, multiple software packages, etc)
# feature_group = db.Column(db.String) # task, disease, software, age
# feature = db.Column(db.String) # stroop task, schizophrenia, fsl
# value = db.Column(db.Float) # 0.67, 0.3, 0.5 (some measure of confidence for the result)
run = relationship("PipelineRun", backref=backref("results", passive_deletes=True))


Expand Down

0 comments on commit f14f343

Please sign in to comment.