Merge pull request #110 from Aarhus-Psychiatry-Research/mb/add_careml_to_monorepo

feat: add careml to monorepo
MartinBernstorff authored Jun 14, 2023
2 parents f65c385 + 884e6fd commit 3eb1fa5
Showing 96 changed files with 5,545 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@ default_stages: [commit]

repos:
  - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.0.254
+    rev: v0.0.272
    hooks:
      - id: ruff
        args:
@@ -15,7 +15,7 @@ repos:
          ]

  - repo: https://github.com/psf/black
-    rev: 22.8.0
+    rev: 23.3.0
    hooks:
      - id: black
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -177,12 +177,18 @@ mypy-init-return = true
suppress-none-returning = true

[tool.ruff.isort]
-known-third-party = ["wandb"]
+known-third-party = ["wandb", "joblib"]

[tool.ruff.mccabe]
# Unlike Flake8, default to a complexity level of 10.
max-complexity = 10

+[tool.ruff.per-file-ignores]
+"src/psycop/projects/care_ml/model_evaluation/figures/feature_importance/shap/get_shap_values.py" = ["I001"]
+"src/psycop/projects/care_ml/model_evaluation/data/load_true_data.py" = ["I001"]
+"src/psycop/common/global_utils/cache.py" = ["I001"]


[tool.semantic_release]
branch = "main"
version_variable = [
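For context on the new section: I001 is Ruff's isort rule ("unsorted imports"). The three ignores permit a deliberately non-standard import order in those files; the cache.py change further down in this commit shows the pattern. A hypothetical sketch of what the ignore allows, not taken from the diff:

# first-party import placed before a third-party one, which I001
# would flag without the per-file ignore
from psycop.common.global_utils.paths import OVARTACI_SHARED_DIR  # first-party

from joblib import Memory  # third-party, intentionally after first-party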
@@ -1,4 +1,4 @@
"""Describe flattened dataset.""" ""
"""Describe flattened dataset."""
from __future__ import annotations

import logging
@@ -1,4 +1,4 @@
"""Wandb utils.""" ""
"""Wandb utils."""
import traceback

import wandb
@@ -122,7 +122,7 @@ def physical_visits(

    if shak_code is not None:
        sql += f" AND {schema.location_col_name} != 'Ukendt'"
-        sql += f" AND left({schema.location_col_name}, {len(str(shak_code))}) {shak_sql_operator} {str(shak_code)}"
+        sql += f" AND left({schema.location_col_name}, {len(str(shak_code))}) {shak_sql_operator} {shak_code!s}"

    if where_clause is not None:
        sql += f" {where_separator} {where_clause}"
2 changes: 1 addition & 1 deletion src/psycop/common/feature_generation/loaders/raw/utils.py
@@ -154,7 +154,7 @@ def load_from_codes(
    )

    if shak_code is not None:
-        sql += f" AND left({shak_location_col}, {len(str(shak_code))}) {shak_sql_operator} {str(shak_code)}"
+        sql += f" AND left({shak_location_col}, {len(str(shak_code))}) {shak_sql_operator} {shak_code!s}"

    if administration_method:
        allowed_administration_methods = (
3 changes: 2 additions & 1 deletion src/psycop/common/global_utils/cache.py
@@ -1,8 +1,9 @@
from pathlib import Path

-from joblib import Memory
from psycop.common.global_utils.paths import OVARTACI_SHARED_DIR, PSYCOP_PKG_ROOT

+from joblib import Memory
+
# If on Windows
if Path("E:/").exists():
    cache_dir = OVARTACI_SHARED_DIR / "cache"
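For readers unfamiliar with the module being reordered: joblib's `Memory` provides disk-backed memoisation, which is what this cache module wires up. A minimal usage sketch — the drive check mirrors the diff, while `slow_square` and the fallback directory are made up:

from pathlib import Path

from joblib import Memory

cache_dir = Path("E:/cache") if Path("E:/").exists() else Path(".cache")
mem = Memory(location=cache_dir, verbose=0)

@mem.cache  # results are persisted to disk, keyed on the arguments
def slow_square(x: int) -> int:
    return x * x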
@@ -1,4 +1,4 @@
"""Create post split pipeline.""" ""
"""Create post split pipeline."""
import logging

from psycop.common.model_training.config_schemas.full_config import FullConfigSchema
@@ -164,7 +164,7 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
        col_to_drop = [
            c
            for c in outcome_cols
-            if f"_{str(self.pre_split_cfg.min_lookahead_days)}_" not in c
+            if f"_{self.pre_split_cfg.min_lookahead_days!s}_" not in c
        ]

        # If no columns to drop, return the dataset
@@ -19,7 +19,6 @@
TEST_TFIDF_VOCAB = ["tfidf-" + word for word in _test_tf_idf_vocab]

if __name__ == "__main__":
-
    p = Path("tests") / "test_data" / "test_tfidf"

    with open(p / "tfidf_10.pkl", "rb") as f:  # noqa
1 change: 1 addition & 0 deletions src/psycop/projects/care_ml/__init__.py
@@ -0,0 +1 @@

13 changes: 13 additions & 0 deletions src/psycop/projects/care_ml/careml_global_config.py
@@ -0,0 +1,13 @@
# Run elements that are required before wandb init first,
# then run the rest in main so you can wrap it all in
# wandb_alert_on_exception, which will send a Slack alert
# if you have wandb alerts set up in wandb.
from psycop.common.feature_generation.application_modules.project_setup import (
    ProjectInfo,
)
from psycop.common.global_utils.paths import OVARTACI_SHARED_DIR

CAREML_PROJECT_INFO = ProjectInfo(
    project_name="coercion",
    project_path=OVARTACI_SHARED_DIR / "coercion",
)
194 changes: 194 additions & 0 deletions src/psycop/projects/care_ml/cohort_creation/pipelines/create_cohort.py
@@ -0,0 +1,194 @@
"""
This script creates the cohort for the psycop coercion project.
"""

from datetime import date

import numpy as np
import pandas as pd
from psycop.common.feature_generation.loaders.raw.sql_load import sql_load
from psycop.projects.care_ml.cohort_creation.utils.cohort_hyperparameters import (
    exclude_prior_outcome_with_lookbehind,
)
from psycop.projects.care_ml.cohort_creation.utils.utils import (
    concat_readmissions,
    first_coercion_within_admission,
    unpack_adm_days,
)
from psycop_ml_utils.sql.writer import write_df_to_sql

# load data
df_adm = sql_load(
    "SELECT * FROM fct.[FOR_kohorte_indhold_pt_journal_inkl_2021_feb2022]",
)  # only includes admissions in psychiatry (shak code starts with 6600)
df_coercion = sql_load(
    "SELECT * FROM fct.[FOR_tvang_alt_hele_kohorten_inkl_2021_feb2022]",
)  # includes coercion in both psychiatry and somatic care


# ADMISSIONS DATA
# only keep admissions (not ambulatory visits)
df_adm = df_adm[df_adm["pt_type"] == "Indlagt"]

# only keep age >= 18 at the start of contact
df_adm = df_adm[df_adm["alder_start"] >= 18]

# only keep admissions after January 1st 2015 (so we can use lookbehind windows of two years)
df_adm = df_adm[df_adm["datotid_start"] >= "2015-01-01"]

# only keep relevant columns
df_adm = df_adm[["dw_ek_borger", "datotid_start", "datotid_slut"]]

# COERCION DATA
# only include target coercion types: manual restraint, forced medication, and mechanical restraint, excluding voluntary mechanical restraint (i.e., "fastholdelse", "beroligende medicin", and "bæltefiksering", excluding "frivillig bæltefiksering")
df_coercion = df_coercion[
    (
        (df_coercion.typetekst_sei == "Bælte")
        & (df_coercion.begrundtekst_sei != "Frivillig bæltefiksering")
    )
    | (df_coercion.typetekst_sei == "Fastholden")
    | (df_coercion.typetekst_sei == "Beroligende medicin")
]

# only keep relevant columns
df_coercion = df_coercion[
    ["dw_ek_borger", "datotid_start_sei", "typetekst_sei", "behandlingsomraade"]
]

# sort based on patient and start of admission
df_adm = df_adm.sort_values(["dw_ek_borger", "datotid_start"])

# group by patient
df_patients = df_adm.groupby("dw_ek_borger")

# list of dfs; one for each patient
df_patients_list = [df_patients.get_group(key) for key in df_patients.groups]

# concatenate dataframes for individual patients
df_adm = pd.concat([concat_readmissions(patient) for patient in df_patients_list])
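`concat_readmissions` comes from the project's cohort-creation utils and is not part of this diff. A hypothetical sketch of the idea, assuming a readmission is an admission starting within a few hours of the previous discharge (the threshold and all details are assumptions):

import pandas as pd

def concat_readmissions_sketch(
    patient: pd.DataFrame, max_gap_hours: int = 4
) -> pd.DataFrame:
    """Merge admissions separated by < max_gap_hours into one continuous stay."""
    patient = patient.sort_values("datotid_start")
    merged: list[dict] = []
    for row in patient.to_dict("records"):
        if merged and (
            row["datotid_start"] - merged[-1]["datotid_slut"]
            <= pd.Timedelta(hours=max_gap_hours)
        ):
            # readmission: extend the previous stay's discharge time
            merged[-1]["datotid_slut"] = max(
                merged[-1]["datotid_slut"], row["datotid_slut"]
            )
        else:
            merged.append(row)
    return pd.DataFrame(merged)

The per-patient loop could also iterate the groupby directly (`for _, patient in df_patients`) rather than materialising `df_patients_list` with `get_group`.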

# for all patients, join all instances of coercion onto all admissions
df_cohort = df_adm.merge(df_coercion, how="left", on="dw_ek_borger")


# exclude admissions with an instance of coercion between 0 and 365 days (inclusive) before admission start
df_excluded_admissions = exclude_prior_outcome_with_lookbehind(
    df_cohort,
    lookbehind=365,
    col_admission_start="datotid_start",
    col_outcome_start="datotid_start_sei",
)[["dw_ek_borger", "datotid_start"]]

# remove duplicate rows, so we have one row per admission (instead of multiple rows for admissions with multiple coercion instances)
df_excluded_admissions = df_excluded_admissions.drop_duplicates(keep="first")

# outer join of admissions and excluded admissions with an indicator column ("_merge") denoting whether an observation occurs in both datasets
df_cohort = df_cohort.merge(
    df_excluded_admissions,
    how="outer",
    on=["dw_ek_borger", "datotid_start"],
    indicator=True,
)

# exclude rows that are in both datasets (i.e., exclude admissions in "df_excluded_admissions")
df_cohort = df_cohort.loc[df_cohort["_merge"] != "both"]


# only keep instances of coercion that occurred during the particular admission
df_cohort_with_coercion = df_cohort[
    (df_cohort["datotid_start_sei"] > df_cohort["datotid_start"])
    & (df_cohort["datotid_start_sei"] < df_cohort["datotid_slut"])
]

# keep first time of coercion for each admission
# group by admission
df_admissions = df_cohort_with_coercion.groupby(["dw_ek_borger", "datotid_start"])
df_admissions_list = [df_admissions.get_group(key) for key in df_admissions.groups]


df_cohort_with_coercion = pd.concat(
    [first_coercion_within_admission(admission) for admission in df_admissions_list],
)

# remove irrelevant columns from df_cohort, drop duplicates
df_cohort = df_cohort[
    ["dw_ek_borger", "datotid_start", "datotid_slut"]
].drop_duplicates()


# merge with df_cohort_with_coercion
df_cohort = df_cohort.merge(
    df_cohort_with_coercion,
    how="left",
    on=["dw_ek_borger", "datotid_start", "datotid_slut"],
)


# exclude admissions with a missing discharge date or a discharge date after 2021-11-22, due to legal restrictions
df_cohort = df_cohort[
    (df_cohort.datotid_slut.notna()) & (df_cohort.datotid_slut <= "2021-11-22")
]


# for each admission, we want to make a prediction every day


# apply the function unpack_adm_days to every admission row
df_cohort = pd.concat([unpack_adm_days(idx, row) for idx, row in df_cohort.iterrows()]) # type: ignore
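`unpack_adm_days` is also project-internal: per the comment above, it expands each admission into one row per day so a prediction can be issued daily. A hypothetical sketch, assuming one prediction time per calendar day of the admission:

import pandas as pd

def unpack_adm_days_sketch(idx: int, row: pd.Series) -> pd.DataFrame:
    """Return one row per day of the admission, each with a daily pred_time."""
    # idx mirrors the (idx, row) shape of df.iterrows() but is unused here
    pred_times = pd.date_range(row["datotid_start"], row["datotid_slut"], freq="D")
    unpacked = pd.DataFrame([row] * len(pred_times))
    unpacked["pred_time"] = pred_times
    return unpacked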


# create the include_pred_time column (pred times where coercion hasn't happened yet, or no coercion in the admission)
df_cohort["include_pred_time"] = np.where(
    (df_cohort.pred_time < df_cohort.datotid_start_sei)
    | (df_cohort.datotid_start_sei.isna()),
    1,
    0,
)


# load admission data again
df_adm = sql_load(
    "SELECT * FROM fct.[FOR_kohorte_indhold_pt_journal_inkl_2021_feb2022]",
)  # only includes admissions in psychiatry (shak code starts with 6600)

# only keep admission contacts
df_adm = df_adm[df_adm["pt_type"] == "Indlagt"]

# only keep age >= 18 at the start of contact
df_adm = df_adm[df_adm["alder_start"] >= 18]

# only keep admissions after January 1st 2015 (so we can use lookbehind windows of two years)
df_adm = df_adm[df_adm["datotid_start"] >= "2015-01-01"]

# only keep relevant columns
df_adm = df_adm[["dw_ek_borger", "datotid_start", "shakkode_ansvarlig"]]

# left join df_adm on df_cohort
df_cohort = df_cohort.merge(
    df_adm,
    how="left",
    on=["dw_ek_borger", "datotid_start"],
)

# remove admissions in the department of forensic psychiatry (shak code 6600021 and 6600310)
df_cohort = df_cohort[
    (df_cohort["shakkode_ansvarlig"] != "6600310")
    & (df_cohort["shakkode_ansvarlig"] != "6600021")
]


# remove coercion in somatics
df_cohort = df_cohort[df_cohort["behandlingsomraade"] != "Somatikken"]


# write csv with today's date
today = date.today().strftime("%d%m%y")
df_cohort.to_csv(f"cohort_{today}.csv")

# Write to sql database
write_df_to_sql(
    df=df_cohort,
    table_name="psycop_coercion_cohort_with_all_days_without_labels_feb2022",
    if_exists="replace",
    rows_per_chunk=5000,
)