Skip to content

Commit

Permalink
Merge pull request #32 from gymrek-lab/new_plotting
Browse files Browse the repository at this point in the history
Add paper workflows for expanse
  • Loading branch information
RossDeVito authored Oct 31, 2024
2 parents e2d2f7e + 28139d4 commit 1346c00
Show file tree
Hide file tree
Showing 127 changed files with 28,811 additions and 12 deletions.
23 changes: 11 additions & 12 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
# Benchmarking data
# benchmarking/data/citrus_output/
# benchmarking/data/citrus_phenos/
# benchmarking/data/splits/
# benchmarking/data/ukb/*
# !benchmarking/data/ukb/
# !benchmarking/data/ukb/*.py

# benchmarking/prs/plink_output/*
# !benchmarking/prs/plink_output/
# !benchmarking/prs/plink_output/*.py
# benchmarking/prs/prsice2_output/
# Benchmarking
benchmarking*/

# Large plots
paper/pheno_sim/sim_configs/complex_pheno_1.png
paper/pheno_sim/sim_configs/complex_pheno_1_noise.png
paper/pheno_sim/sim_configs/complex_pheno_4.png

# Paper eval
paper/data/
paper/output/
paper/pheno_sim/citrus*/

# Tools
benchmarking/PRSice2/
benchmarking/QCTOOL/
Expand Down
58 changes: 58 additions & 0 deletions paper/pheno_sim/citrus_out_to_plink_fmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Convert CITRUS output CSV to plink phenotype file format.
Example output:
#IID gene1 phenotype
1110 23.31 22.22
2202 34.12 18.23
...
"""

import os

import pandas as pd

# # Write a function to replace main
# def citrus_out_to_plink_phenotype(
# sample_id_col=''
# )

import argparse


if __name__ == "__main__":

# args
parser = argparse.ArgumentParser()

parser.add_argument("desc")

args = parser.parse_args()

sample_id_col = 'sample_id'

pheno_cols = ['phenotype']

csv_path = f'citrus_output/{args.desc}.csv'
output_path = f'citrus_phenos/{args.desc}.pheno'

# function
df = pd.read_csv(
csv_path,
index_col=False
)

df = df[[sample_id_col] + pheno_cols]

# Split FID and IID
df['FID'] = df[sample_id_col].str.split('_').map(lambda x: x[0])
df['IID'] = df[sample_id_col].str.split('_').map(lambda x: x[1])

df[['FID', 'IID'] + pheno_cols].to_csv(
output_path,
sep='\t',
index=False
)


45 changes: 45 additions & 0 deletions paper/pheno_sim/est_heritability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Estimate heritability of simulated CITRUS phenotypes."""

import json
import os

from pheno_sim import PhenoSimulation
from pheno_sim.heritability_estimation import broad_sense_heritability


if __name__ == '__main__':

sim_name = 'gepsi_lin_add_hFULL_hFULL'
config_dir = 'citrus_output'

print(sim_name)

# Load simulation config
with open(os.path.join(config_dir, sim_name + '_config.json'), 'r') as f:
sim_config = json.load(f)

# Create simulation
sim = PhenoSimulation(sim_config)

# Load genotype data
input_vals = sim.run_input_step()

# Compute heritability
confidence_level = 95

heritability = broad_sense_heritability(
sim,
input_vals=input_vals,
n_samples=0.1,
confidence_level=confidence_level/100,
)

# Save results
with open(
os.path.join(
config_dir, sim_name + f'_heritability_ci{confidence_level}.json'
),
'w'
) as f:
json.dump(heritability, f, indent=4)

121 changes: 121 additions & 0 deletions paper/pheno_sim/gen_lin_add_sim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""Generate linear additive phenotype CITRUS simulation configs."""

import copy
import json

import numpy as np
import pandas as pd


CONFIG_TEMPLATE = {
"input": [
{
"file": "../data/ukb/vcf/phased_chr19.vcf",
"file_format": "vcf",
"reference_genome": "GRCh37",
"input_nodes": [
{
"alias": "variants",
"type": "SNP",
"chr": "19",
"pos": []
}
]
}
],
"simulation_steps": [
{
"type": "RandomConstant",
"alias": "variants_betas",
"input_match_size": "variants",
"dist_name": "normal",
"dist_kwargs": {
"loc": 0.0,
"scale": 1.0
},
"by_feat": True
},
{
"type": "Product",
"alias": "variant_effects",
"input_aliases": [
"variants_betas",
"variants"
]
},
{
"type": "AdditiveCombine",
"alias": "combined_variant_effects",
"input_alias": "variant_effects"
},
{
"type": "SumReduce",
"alias": "no_noise_phenotype",
"input_alias": "combined_variant_effects"
},
{
"type": "Heritability",
"alias": "phenotype",
"input_alias": "no_noise_phenotype",
}
]
}


class NpEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.integer):
return int(obj)
if isinstance(obj, np.floating):
return float(obj)
if isinstance(obj, np.ndarray):
return obj.tolist()
return super(NpEncoder, self).default(obj)


def create_config(pos, heritability):
sim_config = copy.deepcopy(CONFIG_TEMPLATE)

sim_config['input'][0]['input_nodes'][0]['pos'] = list(pos)
sim_config['simulation_steps'][-1]['heritability'] = float(heritability)

return sim_config


if __name__ == '__main__':

# Options
n_vars = [10, 100, 1000]
herit_vals = [0.05, 0.1, 0.2, 0.4, 0.8]
version_num = 1

config_save_dir = 'sim_configs'

# Load variant info
var_info_path = '../data/ukb/chr19_var_info.csv'
var_df = pd.read_csv(
var_info_path
)

# Add position column
var_df['pos'] = var_df.locus.apply(lambda x: int(x.split(':')[1]))

for n_var in n_vars:
for herit in herit_vals:
rng = np.random.default_rng(seed=147*version_num)

# Select loci
pos = rng.choice(
var_df.pos,
size=n_var,
replace=False
)

sim_config = create_config(list(pos), herit)

# Save config
h_str = str(herit).split('.')[1]
config_fname = f'lin_add_nvar{n_var}_h{h_str}_v{version_num}'

with open(f'{config_save_dir}/{config_fname}.json', 'w') as f:
json.dump(sim_config, f, indent=4, cls=NpEncoder)
Loading

0 comments on commit 1346c00

Please sign in to comment.