Add paper workflows for expanse

gymrek-lab · Oct 31, 2024 · 2b69139 · 2b69139
1 parent f1ad9a0
commit 2b69139
Show file tree

Hide file tree

Showing 131 changed files with 28,806 additions and 36 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,17 +1,11 @@
-# Benchmarking data
-# benchmarking/data/citrus_output/
-# benchmarking/data/citrus_phenos/
-# benchmarking/data/splits/
-# benchmarking/data/ukb/*
-# !benchmarking/data/ukb/
-# !benchmarking/data/ukb/*.py
-
-# benchmarking/prs/plink_output/*
-# !benchmarking/prs/plink_output/
-# !benchmarking/prs/plink_output/*.py
-# benchmarking/prs/prsice2_output/
+# Benchmarking
 benchmarking*/
 
+# Paper eval
+paper/data/
+paper/output/
+paper/pheno_sim/citrus*/
+
 # Tools
 benchmarking/PRSice2/
 benchmarking/QCTOOL/

diff --git a/cl_tool/cli.py b/cl_tool/cli.py
@@ -21,30 +21,6 @@
 			<path_to_genotype_file> ...
 			
 		CITRUS_sim -c <path_to_config_file> -g <path_to_genotype_file>
-
-Output:
-
-	If no additional flags are provided, output will be written to the
-	current working directory. The output will be a CSV file (output.csv)
-	containing sample IDs and all corresponding values from the simulation,
-	and a JSON file (sim_config.json) containing the simulation configuration
-	(including any random selections made by nodes).
-
-	If the -o or --output_dir flag is provided, if the directory does not
-	exist it will be created, and the output files will be saved to it. By
-	default the output files will be named output.csv and sim_config.json,
-	but these can be changed with the -f or --output_file_name and -j or
-	--output_config_json flags, respectively.
-
-	Output file will by default be a comma seperated CSV file. Use -t or
-	--tsv flag to instead save as a tab seperated TSV file.
-
-Example Usage:
-
-	CITRUS_sim -c config.json -o sim_results/output_dir
-
-	CITRUS_sim -c config.json -g genotype_file_1 genotype_file_2 \\
-		-o sim_results/output_dir -t -f my_output.tsv -j my_sim_config.json
 """
 
 import click

diff --git a/paper/pheno_sim/citrus_out_to_plink_fmt.py b/paper/pheno_sim/citrus_out_to_plink_fmt.py
@@ -0,0 +1,58 @@
+"""Convert CITRUS output CSV to plink phenotype file format.
+
+Example output:
+
+	#IID	gene1	phenotype
+	1110	23.31	22.22
+	2202	34.12	18.23
+	...
+	
+"""
+
+import os
+
+import pandas as pd
+
+# # Write a function to replace main
+# def citrus_out_to_plink_phenotype(
+# 	sample_id_col=''
+# )
+
+import argparse
+
+
+if __name__ == "__main__":
+
+	# args
+	parser = argparse.ArgumentParser()
+
+	parser.add_argument("desc")
+
+	args = parser.parse_args()
+
+	sample_id_col = 'sample_id'
+
+	pheno_cols = ['phenotype']
+
+	csv_path = f'citrus_output/{args.desc}.csv'
+	output_path = f'citrus_phenos/{args.desc}.pheno'
+
+	# function
+	df = pd.read_csv(
+		csv_path,
+		index_col=False
+	)
+
+	df = df[[sample_id_col] + pheno_cols]
+
+	# Split FID and IID
+	df['FID'] = df[sample_id_col].str.split('_').map(lambda x: x[0])
+	df['IID'] = df[sample_id_col].str.split('_').map(lambda x: x[1])
+
+	df[['FID', 'IID'] + pheno_cols].to_csv(
+		output_path,
+		sep='\t',
+		index=False
+	)
+
+
diff --git a/paper/pheno_sim/est_heritability.py b/paper/pheno_sim/est_heritability.py
@@ -0,0 +1,45 @@
+"""Estimate heritability of simulated CITRUS phenotypes."""
+
+import json
+import os
+
+from pheno_sim import PhenoSimulation
+from pheno_sim.heritability_estimation import broad_sense_heritability
+
+
+if __name__ == '__main__':
+
+	sim_name = 'gepsi_lin_add_hFULL_hFULL'
+	config_dir = 'citrus_output'
+
+	print(sim_name)
+
+	# Load simulation config
+	with open(os.path.join(config_dir, sim_name + '_config.json'), 'r') as f:
+		sim_config = json.load(f)
+
+	# Create simulation
+	sim = PhenoSimulation(sim_config)
+
+	# Load genotype data
+	input_vals = sim.run_input_step()
+
+	# Compute heritability
+	confidence_level = 95
+
+	heritability = broad_sense_heritability(
+		sim,
+		input_vals=input_vals,
+		n_samples=0.1,
+		confidence_level=confidence_level/100,
+	)
+
+	# Save results
+	with open(
+		os.path.join(
+			config_dir, sim_name + f'_heritability_ci{confidence_level}.json'
+		),
+		'w'
+	) as f:
+		json.dump(heritability, f, indent=4)
+
diff --git a/paper/pheno_sim/gen_lin_add_sim.py b/paper/pheno_sim/gen_lin_add_sim.py
@@ -0,0 +1,121 @@
+"""Generate linear additive phenotype CITRUS simulation configs."""
+
+import copy
+import json
+
+import numpy as np
+import pandas as pd
+
+
+CONFIG_TEMPLATE = {
+	"input": [
+		{
+			"file": "../data/ukb/vcf/phased_chr19.vcf",
+			"file_format": "vcf",
+			"reference_genome": "GRCh37",
+			"input_nodes": [
+				{
+					"alias": "variants",
+					"type": "SNP",
+					"chr": "19",
+					"pos": []
+				}
+			]
+		}
+	],
+	"simulation_steps": [
+		{
+			"type": "RandomConstant",
+			"alias": "variants_betas",
+			"input_match_size": "variants",
+			"dist_name": "normal",
+			"dist_kwargs": {
+				"loc": 0.0,
+				"scale": 1.0
+			},
+			"by_feat": True
+		},
+		{
+			"type": "Product",
+			"alias": "variant_effects",
+			"input_aliases": [
+				"variants_betas",
+				"variants"
+			]
+		},
+		{
+			"type": "AdditiveCombine",
+			"alias": "combined_variant_effects",
+			"input_alias": "variant_effects"
+		},
+		{
+			"type": "SumReduce",
+			"alias": "no_noise_phenotype",
+			"input_alias": "combined_variant_effects"
+		},
+		{
+			"type": "Heritability",
+			"alias": "phenotype",
+			"input_alias": "no_noise_phenotype",
+		}
+	]
+}
+
+
+class NpEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.integer):
+            return int(obj)
+        if isinstance(obj, np.floating):
+            return float(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        return super(NpEncoder, self).default(obj)
+
+
+def create_config(pos, heritability):
+	sim_config = copy.deepcopy(CONFIG_TEMPLATE)
+
+	sim_config['input'][0]['input_nodes'][0]['pos'] = list(pos)
+	sim_config['simulation_steps'][-1]['heritability'] = float(heritability)
+
+	return sim_config
+
+
+if __name__ == '__main__':
+
+	# Options
+	n_vars = [10, 100, 1000]
+	herit_vals = [0.05, 0.1, 0.2, 0.4, 0.8]
+	version_num = 1
+
+	config_save_dir = 'sim_configs'
+
+	# Load variant info
+	var_info_path = '../data/ukb/chr19_var_info.csv'
+	var_df = pd.read_csv(
+		var_info_path
+	)
+
+	# Add position column
+	var_df['pos'] = var_df.locus.apply(lambda x: int(x.split(':')[1]))
+
+	for n_var in n_vars:
+		for herit in herit_vals:
+			rng = np.random.default_rng(seed=147*version_num)
+
+			# Select loci
+			pos = rng.choice(
+				var_df.pos,
+				size=n_var,
+				replace=False
+			)
+
+			sim_config = create_config(list(pos), herit)
+
+			# Save config
+			h_str = str(herit).split('.')[1]
+			config_fname = f'lin_add_nvar{n_var}_h{h_str}_v{version_num}'
+
+			with open(f'{config_save_dir}/{config_fname}.json', 'w') as f:
+				json.dump(sim_config, f, indent=4, cls=NpEncoder)