Skip to content

Commit

Permalink
Add paper workflows for expanse
Browse files Browse the repository at this point in the history
  • Loading branch information
Ross Devito committed Oct 31, 2024
1 parent f1ad9a0 commit 2b69139
Show file tree
Hide file tree
Showing 131 changed files with 28,806 additions and 36 deletions.
18 changes: 6 additions & 12 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,17 +1,11 @@
# Benchmarking data
# benchmarking/data/citrus_output/
# benchmarking/data/citrus_phenos/
# benchmarking/data/splits/
# benchmarking/data/ukb/*
# !benchmarking/data/ukb/
# !benchmarking/data/ukb/*.py

# benchmarking/prs/plink_output/*
# !benchmarking/prs/plink_output/
# !benchmarking/prs/plink_output/*.py
# benchmarking/prs/prsice2_output/
# Benchmarking
benchmarking*/

# Paper eval
paper/data/
paper/output/
paper/pheno_sim/citrus*/

# Tools
benchmarking/PRSice2/
benchmarking/QCTOOL/
Expand Down
24 changes: 0 additions & 24 deletions cl_tool/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,6 @@
<path_to_genotype_file> ...
CITRUS_sim -c <path_to_config_file> -g <path_to_genotype_file>
Output:
If no additional flags are provided, output will be written to the
current working directory. The output will be a CSV file (output.csv)
containing sample IDs and all corresponding values from the simulation,
and a JSON file (sim_config.json) containing the simulation configuration
(including any random selections made by nodes).
If the -o or --output_dir flag is provided, if the directory does not
exist it will be created, and the output files will be saved to it. By
default the output files will be named output.csv and sim_config.json,
but these can be changed with the -f or --output_file_name and -j or
--output_config_json flags, respectively.
Output file will by default be a comma seperated CSV file. Use -t or
--tsv flag to instead save as a tab seperated TSV file.
Example Usage:
CITRUS_sim -c config.json -o sim_results/output_dir
CITRUS_sim -c config.json -g genotype_file_1 genotype_file_2 \\
-o sim_results/output_dir -t -f my_output.tsv -j my_sim_config.json
"""

import click
Expand Down
58 changes: 58 additions & 0 deletions paper/pheno_sim/citrus_out_to_plink_fmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Convert CITRUS output CSV to plink phenotype file format.
Example output:
#IID gene1 phenotype
1110 23.31 22.22
2202 34.12 18.23
...
"""

import os

import pandas as pd

# # Write a function to replace main
# def citrus_out_to_plink_phenotype(
# sample_id_col=''
# )

import argparse


if __name__ == "__main__":

# args
parser = argparse.ArgumentParser()

parser.add_argument("desc")

args = parser.parse_args()

sample_id_col = 'sample_id'

pheno_cols = ['phenotype']

csv_path = f'citrus_output/{args.desc}.csv'
output_path = f'citrus_phenos/{args.desc}.pheno'

# function
df = pd.read_csv(
csv_path,
index_col=False
)

df = df[[sample_id_col] + pheno_cols]

# Split FID and IID
df['FID'] = df[sample_id_col].str.split('_').map(lambda x: x[0])
df['IID'] = df[sample_id_col].str.split('_').map(lambda x: x[1])

df[['FID', 'IID'] + pheno_cols].to_csv(
output_path,
sep='\t',
index=False
)


45 changes: 45 additions & 0 deletions paper/pheno_sim/est_heritability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Estimate heritability of simulated CITRUS phenotypes."""

import json
import os

from pheno_sim import PhenoSimulation
from pheno_sim.heritability_estimation import broad_sense_heritability


if __name__ == '__main__':

sim_name = 'gepsi_lin_add_hFULL_hFULL'
config_dir = 'citrus_output'

print(sim_name)

# Load simulation config
with open(os.path.join(config_dir, sim_name + '_config.json'), 'r') as f:
sim_config = json.load(f)

# Create simulation
sim = PhenoSimulation(sim_config)

# Load genotype data
input_vals = sim.run_input_step()

# Compute heritability
confidence_level = 95

heritability = broad_sense_heritability(
sim,
input_vals=input_vals,
n_samples=0.1,
confidence_level=confidence_level/100,
)

# Save results
with open(
os.path.join(
config_dir, sim_name + f'_heritability_ci{confidence_level}.json'
),
'w'
) as f:
json.dump(heritability, f, indent=4)

121 changes: 121 additions & 0 deletions paper/pheno_sim/gen_lin_add_sim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""Generate linear additive phenotype CITRUS simulation configs."""

import copy
import json

import numpy as np
import pandas as pd


CONFIG_TEMPLATE = {
"input": [
{
"file": "../data/ukb/vcf/phased_chr19.vcf",
"file_format": "vcf",
"reference_genome": "GRCh37",
"input_nodes": [
{
"alias": "variants",
"type": "SNP",
"chr": "19",
"pos": []
}
]
}
],
"simulation_steps": [
{
"type": "RandomConstant",
"alias": "variants_betas",
"input_match_size": "variants",
"dist_name": "normal",
"dist_kwargs": {
"loc": 0.0,
"scale": 1.0
},
"by_feat": True
},
{
"type": "Product",
"alias": "variant_effects",
"input_aliases": [
"variants_betas",
"variants"
]
},
{
"type": "AdditiveCombine",
"alias": "combined_variant_effects",
"input_alias": "variant_effects"
},
{
"type": "SumReduce",
"alias": "no_noise_phenotype",
"input_alias": "combined_variant_effects"
},
{
"type": "Heritability",
"alias": "phenotype",
"input_alias": "no_noise_phenotype",
}
]
}


class NpEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.integer):
return int(obj)
if isinstance(obj, np.floating):
return float(obj)
if isinstance(obj, np.ndarray):
return obj.tolist()
return super(NpEncoder, self).default(obj)


def create_config(pos, heritability):
sim_config = copy.deepcopy(CONFIG_TEMPLATE)

sim_config['input'][0]['input_nodes'][0]['pos'] = list(pos)
sim_config['simulation_steps'][-1]['heritability'] = float(heritability)

return sim_config


if __name__ == '__main__':

# Options
n_vars = [10, 100, 1000]
herit_vals = [0.05, 0.1, 0.2, 0.4, 0.8]
version_num = 1

config_save_dir = 'sim_configs'

# Load variant info
var_info_path = '../data/ukb/chr19_var_info.csv'
var_df = pd.read_csv(
var_info_path
)

# Add position column
var_df['pos'] = var_df.locus.apply(lambda x: int(x.split(':')[1]))

for n_var in n_vars:
for herit in herit_vals:
rng = np.random.default_rng(seed=147*version_num)

# Select loci
pos = rng.choice(
var_df.pos,
size=n_var,
replace=False
)

sim_config = create_config(list(pos), herit)

# Save config
h_str = str(herit).split('.')[1]
config_fname = f'lin_add_nvar{n_var}_h{h_str}_v{version_num}'

with open(f'{config_save_dir}/{config_fname}.json', 'w') as f:
json.dump(sim_config, f, indent=4, cls=NpEncoder)
Loading

0 comments on commit 2b69139

Please sign in to comment.