Merge pull request #8 from yli091230/add-association-example

add files and notebook for association test
harbi811 · Apr 27, 2024 · dc867ed · dc867ed
2 parents f111a1e + 7dbab05
commit dc867ed
Show file tree

Hide file tree

Showing 4 changed files with 1,832 additions and 0 deletions.
diff --git a/association_example.ipynb b/association_example.ipynb
@@ -0,0 +1,178 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# This notebook provides an example script for running an association test\n",
+    "1. Install all required packages\n",
+    "2. Download required files from the `files_for_association` folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import csv\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import math\n",
+    "from tqdm import tqdm\n",
+    "import random\n",
+    "import statsmodels.api as sm\n",
+    "from sklearn import preprocessing\n",
+    "import statsmodels.stats.multitest\n",
+    "\n",
+    "def getData(filename):\n",
+    "    with open(filename, \"r\") as csvfile:\n",
+    "        reader = csv.reader(csvfile)\n",
+    "        for row in reader:\n",
+    "            yield row\n",
+    "\n",
+    "def exonstrSLR(chrom,Pop,files_dir,minTs=10,mingt=3,minsPgt=3,search_range=100_000):\n",
+    "    \"\"\"\n",
+    "    Test every STR/exon pair on one chromosome for association by linear regression.\n",
+    "\n",
+    "    Each data row of `{files_dir}/chr{chrom}.GB.FORMAT` is one STR; a sample's genotype\n",
+    "    is the sum of the two values in its `a/b` call. Genotypes seen in fewer than\n",
+    "    `minsPgt` samples are dropped, then each exon whose [start, end] interval lies\n",
+    "    within `search_range` of the STR position is fit with OLS on standardized data:\n",
+    "    expression ~ genotype + covariates. The fit is repeated on randomly shuffled\n",
+    "    expression values as a permuted control.\n",
+    "\n",
+    "    chrom: chromosome number, used to build the genotype and expression file names\n",
+    "    Pop: population prefix of the expression file\n",
+    "    files_dir: directory holding the covariate, expression and genotype files\n",
+    "    minTs: minimum samples required for regression\n",
+    "    mingt: minimum genotypes required for regression\n",
+    "    minsPgt: minimum samples required for a genotype to be included in regression\n",
+    "    search_range: distance around each exon searched for associated STRs\n",
+    "\n",
+    "    Returns a DataFrame with one row per tested STR/exon pair holding the p-value,\n",
+    "    slope and standard error of the genotype term for the observed and shuffled fits.\n",
+    "    \"\"\"\n",
+    "    result_columns = ['str-gene','str_end','motif','gene_name','sample_n','GT_n',\n",
+    "                      'p_values','slope','error','shuffled_p','shuffled_slope','shuffled_error']\n",
+    "\n",
+    "    def parse_call(call):\n",
+    "        # 'a/b' -> [a, b]; a field without '/' is treated as a missing call\n",
+    "        if '/' in call:\n",
+    "            first, second = call.split('/')[:2]\n",
+    "            return [int(first), int(second)]\n",
+    "        return [None, None]\n",
+    "\n",
+    "    #load covariates\n",
+    "    cov_df=pd.read_csv(f'{files_dir}/covariates_all.csv',sep='\\t',index_col='sample_id')\n",
+    "    #expression file now follows `chrom` (it was hard-coded to chr21, so any other\n",
+    "    #chromosome silently paired its STRs with chr21 exons and produced no results)\n",
+    "    exp_df=pd.read_csv(f'{files_dir}/{Pop}_normalized_and_filtered_hg38_chr{chrom}.csv',sep='\\t')\n",
+    "    gt_dir=f'{files_dir}/chr{chrom}.GB.FORMAT'\n",
+    "\n",
+    "    #results are collected in a list and turned into a DataFrame once at the end:\n",
+    "    #DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call\n",
+    "    records = []\n",
+    "    col_name = None\n",
+    "    #the total length just for illustration purpose, can remove tqdm when running in large batches\n",
+    "    for row in tqdm(getData(gt_dir),total=1067):\n",
+    "        #the file is tab-separated but read with the default csv dialect,\n",
+    "        #so each line arrives as a single field that is split here\n",
+    "        if col_name is None:\n",
+    "            #first line is the header: 4 STR annotation columns followed by sample names\n",
+    "            col_name=row[0].split('\\t')\n",
+    "            continue\n",
+    "\n",
+    "        gt_value=[field for field in row[0].split('\\t') if field!='']\n",
+    "        gt_df=pd.DataFrame([gt_value],columns=col_name)\n",
+    "        str_chrom=gt_df['CHROM'].iloc[0]\n",
+    "        pos_text=gt_df['POS'].iloc[0]\n",
+    "        str_pos=int(pos_text)\n",
+    "\n",
+    "        #per-sample allele pairs, keeping only samples that have a call\n",
+    "        calls=gt_df.iloc[0,4:].map(parse_call)\n",
+    "        gt_samples=[sample for sample, call in calls.items() if call != [None,None]]\n",
+    "        gt_psi_samples=list(set(exp_df.columns[5:]) & set(gt_samples))\n",
+    "\n",
+    "        #STR GENOTYPE FILTER\n",
+    "        #genotype of a sample = sum of its two allele values\n",
+    "        gt_sums=calls.loc[gt_psi_samples].map(sum).rename('GT')\n",
+    "        gt_counts=gt_sums.value_counts()\n",
+    "        kept_counts=gt_counts[gt_counts>=minsPgt]\n",
+    "        gt_ab_3=kept_counts.index.tolist()\n",
+    "        #check number of genotype and check how many samples remains\n",
+    "        #(an STR with no overlapping samples is skipped here as well)\n",
+    "        if len(gt_ab_3)<mingt or kept_counts.sum()<minTs:\n",
+    "            continue\n",
+    "\n",
+    "        #get joint data: samples with a retained genotype that also have covariates\n",
+    "        gt_psi_samples=list(set(gt_sums[gt_sums.isin(gt_ab_3)].index) & set(cov_df.index))\n",
+    "        joint_gt=gt_sums.loc[gt_psi_samples]\n",
+    "        joint_cov_df=cov_df.loc[gt_psi_samples]\n",
+    "        joint_exp_df=exp_df[exp_df.columns[0:5].tolist()+gt_psi_samples]\n",
+    "        joint_exp_df=joint_exp_df[joint_exp_df.chromosome == str_chrom]\n",
+    "\n",
+    "        paired_df=joint_exp_df[(joint_exp_df.start - search_range <= str_pos)\n",
+    "                         & (joint_exp_df.end + search_range >= str_pos)]\n",
+    "\n",
+    "        if paired_df.empty:\n",
+    "            continue\n",
+    "\n",
+    "        for index, curr_exon in paired_df.iterrows():\n",
+    "            #concat the PSI, genotype, peer and pc for current exon of all samples;\n",
+    "            #column order (exp, GT, covariates) is relied on below\n",
+    "            a=pd.concat([curr_exon.iloc[5:].astype('float64').rename('exp'),\n",
+    "                       joint_gt,\n",
+    "                       joint_cov_df],\n",
+    "                       axis=1)\n",
+    "\n",
+    "            a=a[a['exp'].notnull()]\n",
+    "            #standardization\n",
+    "            a_scaled = preprocessing.StandardScaler().fit_transform(a)\n",
+    "            y=a_scaled[:,0]\n",
+    "            #add_constant prepends the intercept column, so index 1 is the genotype term\n",
+    "            x=sm.add_constant(a_scaled[:,1:])\n",
+    "            res_ols = sm.OLS(y,x).fit()\n",
+    "\n",
+    "            #permuted control: same design matrix, shuffled response\n",
+    "            shuffled_y = random.sample(list(y),len(y))\n",
+    "            res_ols_s = sm.OLS(shuffled_y,x).fit()\n",
+    "\n",
+    "            records.append({'str-gene':f'{str_chrom}_{pos_text.rstrip()}-{curr_exon.gene_id}',\n",
+    "                            'str_end':gt_df['END'].iloc[0],\n",
+    "                            'motif':gt_df['motif'].iloc[0],\n",
+    "                            'gene_name':curr_exon.gene_name,'sample_n':len(a),'GT_n':len(gt_ab_3),\n",
+    "                            'p_values':res_ols.pvalues[1],'slope':res_ols.params[1],'error':res_ols.bse[1],\n",
+    "                            'shuffled_p':res_ols_s.pvalues[1],'shuffled_slope':res_ols_s.params[1],\n",
+    "                            'shuffled_error':res_ols_s.bse[1]})\n",
+    "\n",
+    "    #passing `columns` keeps the result schema stable even when nothing was tested\n",
+    "    return pd.DataFrame(records,columns=result_columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# directory where the association files are located (point this at your copy of `files_for_association`)\n",
+    "reg_file_dir=\"/expanse/protected/gymreklab-dbgap/mount/yal084/genotyping_repeats_tutorial/files_for_association\"\n",
+    "#running regression\n",
+    "reg_results=exonstrSLR(21,\"AFR\",reg_file_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reg_results.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}