Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add paper workflows for expanse #32

Merged
merged 3 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 11 additions & 12 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
# Benchmarking data
# benchmarking/data/citrus_output/
# benchmarking/data/citrus_phenos/
# benchmarking/data/splits/
# benchmarking/data/ukb/*
# !benchmarking/data/ukb/
# !benchmarking/data/ukb/*.py

# benchmarking/prs/plink_output/*
# !benchmarking/prs/plink_output/
# !benchmarking/prs/plink_output/*.py
# benchmarking/prs/prsice2_output/
# Benchmarking
benchmarking*/

# Large plots
paper/pheno_sim/sim_configs/complex_pheno_1.png
paper/pheno_sim/sim_configs/complex_pheno_1_noise.png
paper/pheno_sim/sim_configs/complex_pheno_4.png

# Paper eval
paper/data/
paper/output/
paper/pheno_sim/citrus*/

# Tools
benchmarking/PRSice2/
benchmarking/QCTOOL/
Expand Down
58 changes: 58 additions & 0 deletions paper/pheno_sim/citrus_out_to_plink_fmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Convert CITRUS output CSV to plink phenotype file format.

Example output:

#IID gene1 phenotype
1110 23.31 22.22
2202 34.12 18.23
...

"""

import os

import pandas as pd

# # Write a function to replace main
# def citrus_out_to_plink_phenotype(
# sample_id_col=''
# )

import argparse


if __name__ == "__main__":

# args
parser = argparse.ArgumentParser()

parser.add_argument("desc")

args = parser.parse_args()

sample_id_col = 'sample_id'

pheno_cols = ['phenotype']

csv_path = f'citrus_output/{args.desc}.csv'
output_path = f'citrus_phenos/{args.desc}.pheno'

# function
df = pd.read_csv(
csv_path,
index_col=False
)

df = df[[sample_id_col] + pheno_cols]

# Split FID and IID
df['FID'] = df[sample_id_col].str.split('_').map(lambda x: x[0])
df['IID'] = df[sample_id_col].str.split('_').map(lambda x: x[1])

df[['FID', 'IID'] + pheno_cols].to_csv(
output_path,
sep='\t',
index=False
)


45 changes: 45 additions & 0 deletions paper/pheno_sim/est_heritability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Estimate heritability of simulated CITRUS phenotypes."""

import json
import os

from pheno_sim import PhenoSimulation
from pheno_sim.heritability_estimation import broad_sense_heritability


if __name__ == '__main__':

sim_name = 'gepsi_lin_add_hFULL_hFULL'
config_dir = 'citrus_output'

print(sim_name)

# Load simulation config
with open(os.path.join(config_dir, sim_name + '_config.json'), 'r') as f:
sim_config = json.load(f)

# Create simulation
sim = PhenoSimulation(sim_config)

# Load genotype data
input_vals = sim.run_input_step()

# Compute heritability
confidence_level = 95

heritability = broad_sense_heritability(
sim,
input_vals=input_vals,
n_samples=0.1,
confidence_level=confidence_level/100,
)

# Save results
with open(
os.path.join(
config_dir, sim_name + f'_heritability_ci{confidence_level}.json'
),
'w'
) as f:
json.dump(heritability, f, indent=4)

121 changes: 121 additions & 0 deletions paper/pheno_sim/gen_lin_add_sim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""Generate linear additive phenotype CITRUS simulation configs."""

import copy
import json

import numpy as np
import pandas as pd


CONFIG_TEMPLATE = {
"input": [
{
"file": "../data/ukb/vcf/phased_chr19.vcf",
"file_format": "vcf",
"reference_genome": "GRCh37",
"input_nodes": [
{
"alias": "variants",
"type": "SNP",
"chr": "19",
"pos": []
}
]
}
],
"simulation_steps": [
{
"type": "RandomConstant",
"alias": "variants_betas",
"input_match_size": "variants",
"dist_name": "normal",
"dist_kwargs": {
"loc": 0.0,
"scale": 1.0
},
"by_feat": True
},
{
"type": "Product",
"alias": "variant_effects",
"input_aliases": [
"variants_betas",
"variants"
]
},
{
"type": "AdditiveCombine",
"alias": "combined_variant_effects",
"input_alias": "variant_effects"
},
{
"type": "SumReduce",
"alias": "no_noise_phenotype",
"input_alias": "combined_variant_effects"
},
{
"type": "Heritability",
"alias": "phenotype",
"input_alias": "no_noise_phenotype",
}
]
}


class NpEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.integer):
return int(obj)
if isinstance(obj, np.floating):
return float(obj)
if isinstance(obj, np.ndarray):
return obj.tolist()
return super(NpEncoder, self).default(obj)


def create_config(pos, heritability):
sim_config = copy.deepcopy(CONFIG_TEMPLATE)

sim_config['input'][0]['input_nodes'][0]['pos'] = list(pos)
sim_config['simulation_steps'][-1]['heritability'] = float(heritability)

return sim_config


if __name__ == '__main__':

# Options
n_vars = [10, 100, 1000]
herit_vals = [0.05, 0.1, 0.2, 0.4, 0.8]
version_num = 1

config_save_dir = 'sim_configs'

# Load variant info
var_info_path = '../data/ukb/chr19_var_info.csv'
var_df = pd.read_csv(
var_info_path
)

# Add position column
var_df['pos'] = var_df.locus.apply(lambda x: int(x.split(':')[1]))

for n_var in n_vars:
for herit in herit_vals:
rng = np.random.default_rng(seed=147*version_num)

# Select loci
pos = rng.choice(
var_df.pos,
size=n_var,
replace=False
)

sim_config = create_config(list(pos), herit)

# Save config
h_str = str(herit).split('.')[1]
config_fname = f'lin_add_nvar{n_var}_h{h_str}_v{version_num}'

with open(f'{config_save_dir}/{config_fname}.json', 'w') as f:
json.dump(sim_config, f, indent=4, cls=NpEncoder)
Loading
Loading