Skip to content

Commit

Permalink
remove commented lines
Browse files Browse the repository at this point in the history
  • Loading branch information
jdkent committed Jan 13, 2025
1 parent 3796e66 commit f14f343
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 4 deletions.
110 changes: 110 additions & 0 deletions store/neurostore/ingest/extracted_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Ingest extracted features into the database."""

import json
import os.path as op
import re
from pathlib import Path
import hashlib

import numpy as np
import pandas as pd
import requests
from scipy import sparse
from dateutil.parser import parse as parse_date
from sqlalchemy import or_

from neurostore.database import db
from neurostore.models import (
Pipeline,
PipelineConfig,
PipelineRun,
PipelineRunResult,
)


def ingest_feature(feature_directory, session):
    """Ingest extracted pipeline features into the database.

    Reads ``pipeline_info.json`` from the base of *feature_directory* to
    find-or-create the matching ``Pipeline`` and ``PipelineConfig``
    records, then creates a single ``PipelineRun`` with one
    ``PipelineRunResult`` per paper sub-directory.

    Parameters
    ----------
    feature_directory : str or pathlib.Path
        Directory containing ``pipeline_info.json`` plus one
        sub-directory per paper; each paper directory holds
        ``results.json`` (the extracted features) and ``info.json``
        (provenance: execution date and input files).
    session : sqlalchemy.orm.Session
        Database session used for lookups and inserts; committed at the
        end of the call.
    """
    feature_directory = Path(feature_directory)

    # read pipeline_info.json from the base feature directory
    with open(feature_directory / "pipeline_info.json") as f:
        pipeline_info = json.load(f)

    # search if there is an existing pipeline with the same name and version
    pipeline = (
        session.query(Pipeline)
        .filter(
            Pipeline.name == pipeline_info["name"],
            Pipeline.version == pipeline_info["version"],
        )
        .first()
    )
    # create a pipeline if it does not exist
    if not pipeline:
        pipeline = Pipeline(
            name=pipeline_info["name"],
            version=pipeline_info["version"],
            description=pipeline_info.get("description"),
            study_dependent=pipeline_info.get("study_dependent", False),
            ace_compatible=pipeline_info.get("ace_compatible", False),
            pubget_compatible=pipeline_info.get("pubget_compatible", False),
            derived_from=pipeline_info.get("derived_from", None),
        )
        session.add(pipeline)
        # flush so the new pipeline is assigned its primary key before
        # it is referenced by PipelineConfig below
        session.flush()

    # search within the pipeline and see if there are any existing pipeline
    # configs that match the "arguments" field in the pipeline_info.json.
    # sort_keys makes the hash independent of key ordering in the JSON
    # file, so semantically identical configs deduplicate correctly.
    config_hash = hashlib.sha256(
        json.dumps(pipeline_info["arguments"], sort_keys=True).encode()
    ).hexdigest()
    pipeline_config = (
        session.query(PipelineConfig)
        .filter(
            PipelineConfig.pipeline_id == pipeline.id,
            PipelineConfig.config_hash == config_hash,
        )
        .first()
    )
    # create a pipeline config if it does not exist
    if not pipeline_config:
        pipeline_config = PipelineConfig(
            pipeline_id=pipeline.id,
            config=pipeline_info["arguments"],
            config_hash=config_hash,
        )
        session.add(pipeline_config)
        # flush so config_id is populated before the PipelineRun uses it
        session.flush()

    # create a new pipeline run
    pipeline_run = PipelineRun(
        pipeline_id=pipeline.id,
        config_id=pipeline_config.id,
    )

    # each sub-directory of the feature directory is one paper
    paper_dirs = [d for d in feature_directory.iterdir() if d.is_dir()]

    # for each paper directory, read the results.json and info.json files
    pipeline_run_results = []
    for paper_dir in paper_dirs:
        with open(paper_dir / "results.json") as f:
            results = json.load(f)

        with open(paper_dir / "info.json") as f:
            info = json.load(f)

        # use the directory name as the base_study_id
        pipeline_run_results.append(
            PipelineRunResult(
                base_study_id=paper_dir.name,
                data=results,
                date_executed=parse_date(info["date"]),
                file_inputs=info["inputs"],
                run=pipeline_run,
            )
        )

    session.add(pipeline_run)
    session.add_all(pipeline_run_results)

    session.commit()
4 changes: 0 additions & 4 deletions store/neurostore/models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,10 +589,6 @@ class PipelineRunResult(BaseMixin, db.Model):
date_executed = db.Column(db.DateTime(timezone=True))
data = db.Column(JSONB)
file_inputs = db.Column(JSONB)
# feature_index = db.Column(db.Integer) # the same categories of information can be extracted multiple times from a single paper (e.g., multiple demographic groups, multiple software packages, etc)
# feature_group = db.Column(db.String) # task, disease, software, age
# feature = db.Column(db.String) # stroop task, schizophrenia, fsl
# value = db.Column(db.Float) # 0.67, 0.3, 0.5 (some measure of confidence for the result)
run = relationship("PipelineRun", backref=backref("results", passive_deletes=True))


Expand Down

0 comments on commit f14f343

Please sign in to comment.