From 537e29ba28ac8be3b01c7c22e0a10a718e9c6a79 Mon Sep 17 00:00:00 2001 From: JohannesKersting Date: Fri, 9 Jun 2023 14:45:21 +0200 Subject: [PATCH 1/3] created dev branch --- dev_tests.ipynb | 950 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 950 insertions(+) create mode 100644 dev_tests.ipynb diff --git a/dev_tests.ipynb b/dev_tests.ipynb new file mode 100644 index 0000000..2bb5c38 --- /dev/null +++ b/dev_tests.ipynb @@ -0,0 +1,950 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 109, + "id": "d03d274e-6792-4bbf-93bf-b8c7259c1d7f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import pandas as pd\n", + "from dysregnet.dysregnet import run" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b6408bef-4768-42c3-88e3-21e250102240", + "metadata": {}, + "outputs": [], + "source": [ + "meta = pd.read_csv(\"test_data/tpm_meta.csv\")\n", + "expr = pd.read_csv(\"test_data/tpm.csv\")\n", + "grn=pd.read_csv(\"test_data/HTRIdb_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0b876315-e883-450e-95a4-88b7caf76085", + "metadata": {}, + "outputs": [], + "source": [ + "meta[\"birth_days_to\"] = meta[\"birth_days_to\"].fillna(meta[\"birth_days_to\"].mean())\n", + "meta[\"race\"] = meta[\"race\"].fillna(\"not reported\")\n", + "meta[\"race\"] = meta[\"race\"].replace({\"[Unknown]\": \"not reported\", \"[Not Evaluated]\":\"not reported\"})\n", + "\n", + "expr = expr.set_index(expr.columns[0])\n", + "expr = expr.T\n", + "expr.insert(0, \"sample\", expr.index)\n", + "assert all(expr.iloc[:, 0].values == meta.iloc[:,0].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "208046d3-5d5f-4ea4-aae7-c575094bc88e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_PATIENTcancer type abbreviationage_at_initial_pathologic_diagnosisgenderraceajcc_pathologic_tumor_stageclinical_stagehistological_typehistological_grade...DSS.timeDFIDFI.timePFIPFI.timeRedactionsample_type_idsample_type_primary_diseasecondition
0TCGA-55-7995-01TCGA-55-7995LUAD73.0FEMALEWHITEStage IANaNLung AdenocarcinomaNaN...889.01.0468.01.0468.0NaN1.0Primary Tumorlung adenocarcinoma1
1TCGA-38-4625-11TCGA-38-4625LUAD66.0FEMALEWHITEStage IBNaNLung AdenocarcinomaNaN...2973.00.02973.00.02973.0NaN11.0Solid Tissue Normallung adenocarcinoma0
2TCGA-69-7761-01TCGA-69-7761LUAD84.0MALEWHITEStage IBNaNLung AdenocarcinomaNaN...186.0NaNNaN0.0186.0NaN1.0Primary Tumorlung adenocarcinoma1
3TCGA-67-6216-01TCGA-67-6216LUAD57.0FEMALEWHITEStage IANaNLung AdenocarcinomaNaN...141.00.0141.00.0141.0NaN1.0Primary Tumorlung adenocarcinoma1
4TCGA-44-6148-01TCGA-44-6148LUAD60.0MALEWHITEStage IANaNLung AdenocarcinomaNaN...704.00.0704.00.0704.0NaN1.0Primary Tumorlung adenocarcinoma1
\n", + "

5 rows × 38 columns

\n", + "
" + ], + "text/plain": [ + " sample _PATIENT cancer type abbreviation \\\n", + "0 TCGA-55-7995-01 TCGA-55-7995 LUAD \n", + "1 TCGA-38-4625-11 TCGA-38-4625 LUAD \n", + "2 TCGA-69-7761-01 TCGA-69-7761 LUAD \n", + "3 TCGA-67-6216-01 TCGA-67-6216 LUAD \n", + "4 TCGA-44-6148-01 TCGA-44-6148 LUAD \n", + "\n", + " age_at_initial_pathologic_diagnosis gender race \\\n", + "0 73.0 FEMALE WHITE \n", + "1 66.0 FEMALE WHITE \n", + "2 84.0 MALE WHITE \n", + "3 57.0 FEMALE WHITE \n", + "4 60.0 MALE WHITE \n", + "\n", + " ajcc_pathologic_tumor_stage clinical_stage histological_type \\\n", + "0 Stage IA NaN Lung Adenocarcinoma \n", + "1 Stage IB NaN Lung Adenocarcinoma \n", + "2 Stage IB NaN Lung Adenocarcinoma \n", + "3 Stage IA NaN Lung Adenocarcinoma \n", + "4 Stage IA NaN Lung Adenocarcinoma \n", + "\n", + " histological_grade ... DSS.time DFI DFI.time PFI PFI.time Redaction \\\n", + "0 NaN ... 889.0 1.0 468.0 1.0 468.0 NaN \n", + "1 NaN ... 2973.0 0.0 2973.0 0.0 2973.0 NaN \n", + "2 NaN ... 186.0 NaN NaN 0.0 186.0 NaN \n", + "3 NaN ... 141.0 0.0 141.0 0.0 141.0 NaN \n", + "4 NaN ... 704.0 0.0 704.0 0.0 704.0 NaN \n", + "\n", + " sample_type_id sample_type _primary_disease condition \n", + "0 1.0 Primary Tumor lung adenocarcinoma 1 \n", + "1 11.0 Solid Tissue Normal lung adenocarcinoma 0 \n", + "2 1.0 Primary Tumor lung adenocarcinoma 1 \n", + "3 1.0 Primary Tumor lung adenocarcinoma 1 \n", + "4 1.0 Primary Tumor lung adenocarcinoma 1 \n", + "\n", + "[5 rows x 38 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "35febb56-1441-4049-a452-450c25afebb1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
samplesampleRP11-34P13.7RP11-34P13.8CICP27RP11-34P13.15RP11-34P13.16RP11-34P13.13FO538757.2AP006222.2RP4-669L17.10...MT-CO2MT-ATP8MT-ATP6MT-CO3MT-ND3MT-ND4LMT-ND4MT-ND5MT-ND6MT-CYB
TCGA-55-7995-01TCGA-55-7995-010.4962530.5621631.8199671.9457911.0133401.4892582.8097332.7200601.448061...0.620082-0.8916160.0316850.590567-0.8471720.5373820.8660380.486255-0.036586-0.059644
TCGA-38-4625-11TCGA-38-4625-11-0.1100670.014913-1.962869-0.813442-0.416048-1.713702-0.940562-1.039181-1.737634...0.151399-0.685961-0.3063750.2891130.895153-0.455624-0.069085-0.627751-0.722210-0.002905
TCGA-69-7761-01TCGA-69-7761-01-0.1100670.5789510.341960-0.0305430.1013050.602601-1.764752-1.1164340.600953...-0.6955600.8427280.008627-0.768548-2.694695-0.043072-0.480471-0.274686-0.205755-1.255776
TCGA-67-6216-01TCGA-67-6216-01-0.414540-2.486586-0.1012311.1626520.7421260.936864-1.340492-1.1164340.747482...0.152220-0.2155300.0568400.926111-2.5445350.081287-0.070977-0.378933-1.056571-0.535812
TCGA-44-6148-01TCGA-44-6148-011.0988180.5952610.7287250.6704730.4395980.7291490.5129861.4454610.868339...0.323585-0.2815480.0943810.2494480.8957980.4062460.483969-0.844608-1.144351-0.227255
\n", + "

5 rows × 22579 columns

\n", + "
" + ], + "text/plain": [ + "sample sample RP11-34P13.7 RP11-34P13.8 CICP27 \\\n", + "TCGA-55-7995-01 TCGA-55-7995-01 0.496253 0.562163 1.819967 \n", + "TCGA-38-4625-11 TCGA-38-4625-11 -0.110067 0.014913 -1.962869 \n", + "TCGA-69-7761-01 TCGA-69-7761-01 -0.110067 0.578951 0.341960 \n", + "TCGA-67-6216-01 TCGA-67-6216-01 -0.414540 -2.486586 -0.101231 \n", + "TCGA-44-6148-01 TCGA-44-6148-01 1.098818 0.595261 0.728725 \n", + "\n", + "sample RP11-34P13.15 RP11-34P13.16 RP11-34P13.13 FO538757.2 \\\n", + "TCGA-55-7995-01 1.945791 1.013340 1.489258 2.809733 \n", + "TCGA-38-4625-11 -0.813442 -0.416048 -1.713702 -0.940562 \n", + "TCGA-69-7761-01 -0.030543 0.101305 0.602601 -1.764752 \n", + "TCGA-67-6216-01 1.162652 0.742126 0.936864 -1.340492 \n", + "TCGA-44-6148-01 0.670473 0.439598 0.729149 0.512986 \n", + "\n", + "sample AP006222.2 RP4-669L17.10 ... MT-CO2 MT-ATP8 MT-ATP6 \\\n", + "TCGA-55-7995-01 2.720060 1.448061 ... 0.620082 -0.891616 0.031685 \n", + "TCGA-38-4625-11 -1.039181 -1.737634 ... 0.151399 -0.685961 -0.306375 \n", + "TCGA-69-7761-01 -1.116434 0.600953 ... -0.695560 0.842728 0.008627 \n", + "TCGA-67-6216-01 -1.116434 0.747482 ... 0.152220 -0.215530 0.056840 \n", + "TCGA-44-6148-01 1.445461 0.868339 ... 0.323585 -0.281548 0.094381 \n", + "\n", + "sample MT-CO3 MT-ND3 MT-ND4L MT-ND4 MT-ND5 MT-ND6 \\\n", + "TCGA-55-7995-01 0.590567 -0.847172 0.537382 0.866038 0.486255 -0.036586 \n", + "TCGA-38-4625-11 0.289113 0.895153 -0.455624 -0.069085 -0.627751 -0.722210 \n", + "TCGA-69-7761-01 -0.768548 -2.694695 -0.043072 -0.480471 -0.274686 -0.205755 \n", + "TCGA-67-6216-01 0.926111 -2.544535 0.081287 -0.070977 -0.378933 -1.056571 \n", + "TCGA-44-6148-01 0.249448 0.895798 0.406246 0.483969 -0.844608 -1.144351 \n", + "\n", + "sample MT-CYB \n", + "TCGA-55-7995-01 -0.059644 \n", + "TCGA-38-4625-11 -0.002905 \n", + "TCGA-69-7761-01 -1.255776 \n", + "TCGA-67-6216-01 -0.535812 \n", + "TCGA-44-6148-01 -0.227255 \n", + "\n", + "[5 rows x 22579 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "expr.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9dd72783-5b5d-488d-937e-f1d34535b29a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SYMBOL_TFSYMBOL_TG
0PARP1BRCA2
1AHRCYP1A1
2AHRCYP1A2
3AHRCYP1B1
4AHRFOS
\n", + "
" + ], + "text/plain": [ + " SYMBOL_TF SYMBOL_TG\n", + "0 PARP1 BRCA2\n", + "1 AHR CYP1A1\n", + "2 AHR CYP1A2\n", + "3 AHR CYP1B1\n", + "4 AHR FOS" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grn.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "abda2742-f17d-440c-b5b2-e021d2c26f9c", + "metadata": {}, + "outputs": [], + "source": [ + "CatCov=['race','gender'] \n", + "ConCov=['birth_days_to',]" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "62b921d3-27e0-4f54-a77b-d3b35d5eedfb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "0it [00:00, ?it/s]/home/johannes/miniconda3/envs/drn_package/lib/python3.11/site-packages/statsmodels-0.14.0-py3.11-linux-x86_64.egg/statsmodels/regression/linear_model.py:1965: RuntimeWarning: divide by zero encountered in scalar divide\n", + " return np.sqrt(eigvals[0]/eigvals[-1])\n", + "/home/johannes/miniconda3/envs/drn_package/lib/python3.11/site-packages/statsmodels-0.14.0-py3.11-linux-x86_64.egg/statsmodels/regression/linear_model.py:1965: RuntimeWarning: divide by zero encountered in scalar divide\n", + " return np.sqrt(eigvals[0]/eigvals[-1])\n", + "/home/johannes/miniconda3/envs/drn_package/lib/python3.11/site-packages/statsmodels-0.14.0-py3.11-linux-x86_64.egg/statsmodels/regression/linear_model.py:1965: RuntimeWarning: divide by zero encountered in scalar divide\n", + " return np.sqrt(eigvals[0]/eigvals[-1])\n", + "3it [00:00, 142.18it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Coefficients\n", + "PARP1 3.268853e-01\n", + "birth_days_to 6.271495e-05\n", + "race_ASIAN -2.220446e-16\n", + "race_BLACK OR AFRICAN AMERICAN 2.340830e-01\n", + "race_WHITE -2.340830e-01\n", + "race_not reported 0.000000e+00\n", + "gender_MALE 2.813038e-01\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: BRCA2 R-squared: 0.180\n", + "Model: OLS Adj. R-squared: 0.119\n", + "Method: Least Squares F-statistic: 2.960\n", + "Date: Fri, 09 Jun 2023 Prob (F-statistic): 0.0278\n", + "Time: 12:31:41 Log-Likelihood: -77.870\n", + "No. Observations: 59 AIC: 165.7\n", + "Df Residuals: 54 BIC: 176.1\n", + "Df Model: 4 \n", + "Covariance Type: nonrobust \n", + "==================================================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "--------------------------------------------------------------------------------------------------\n", + "const 1.0605 0.584 1.815 0.075 -0.111 2.232\n", + "PARP1 0.3269 0.128 2.553 0.014 0.070 0.584\n", + "birth_days_to 6.271e-05 3.54e-05 1.769 0.082 -8.35e-06 0.000\n", + "race_ASIAN 4.308e-17 5.58e-17 0.772 0.443 -6.87e-17 1.55e-16\n", + "race_BLACK OR AFRICAN AMERICAN 0.7643 0.400 1.909 0.062 -0.039 1.567\n", + "race_WHITE 0.2962 0.366 0.809 0.422 -0.438 1.030\n", + "race_not reported 0 0 nan nan 0 0\n", + "gender_MALE 0.2813 0.253 1.111 0.271 -0.226 0.789\n", + "==============================================================================\n", + "Omnibus: 1.700 Durbin-Watson: 2.015\n", + "Prob(Omnibus): 0.427 Jarque-Bera (JB): 1.224\n", + "Skew: 0.350 Prob(JB): 0.542\n", + "Kurtosis: 3.089 Cond. No. inf\n", + "==============================================================================\n", + "\n", + "Notes:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "[2] The smallest eigenvalue is 0. This might indicate that there are\n", + "strong multicollinearity problems or that the design matrix is singular.\n", + " Coefficients\n", + "AHR -4.566222e-02\n", + "birth_days_to 5.034837e-05\n", + "race_ASIAN -5.551115e-17\n", + "race_BLACK OR AFRICAN AMERICAN 1.119309e-01\n", + "race_WHITE -1.119309e-01\n", + "race_not reported 0.000000e+00\n", + "gender_MALE 3.432154e-01\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: CYP1B1 R-squared: 0.082\n", + "Model: OLS Adj. R-squared: 0.014\n", + "Method: Least Squares F-statistic: 1.199\n", + "Date: Fri, 09 Jun 2023 Prob (F-statistic): 0.322\n", + "Time: 12:31:41 Log-Likelihood: -81.208\n", + "No. Observations: 59 AIC: 172.4\n", + "Df Residuals: 54 BIC: 182.8\n", + "Df Model: 4 \n", + "Covariance Type: nonrobust \n", + "==================================================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "--------------------------------------------------------------------------------------------------\n", + "const 0.7744 0.607 1.275 0.208 -0.443 1.992\n", + "AHR -0.0457 0.134 -0.341 0.734 -0.314 0.222\n", + "birth_days_to 5.035e-05 3.68e-05 1.369 0.177 -2.34e-05 0.000\n", + "race_ASIAN 1.573e-17 5.37e-17 0.293 0.771 -9.2e-17 1.23e-16\n", + "race_BLACK OR AFRICAN AMERICAN 0.4991 0.420 1.189 0.240 -0.342 1.341\n", + "race_WHITE 0.2753 0.383 0.719 0.475 -0.493 1.043\n", + "race_not reported 0 0 nan nan 0 0\n", + "gender_MALE 0.3432 0.270 1.273 0.208 -0.197 0.884\n", + "==============================================================================\n", + "Omnibus: 1.499 Durbin-Watson: 2.188\n", + "Prob(Omnibus): 0.473 Jarque-Bera (JB): 1.482\n", + "Skew: -0.357 Prob(JB): 0.477\n", + "Kurtosis: 2.693 Cond. No. inf\n", + "==============================================================================\n", + "\n", + "Notes:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "[2] The smallest eigenvalue is 0. This might indicate that there are\n", + "strong multicollinearity problems or that the design matrix is singular.\n", + " Coefficients\n", + "AHR 2.965066e-01\n", + "birth_days_to -4.140146e-05\n", + "race_ASIAN -2.081668e-17\n", + "race_BLACK OR AFRICAN AMERICAN 3.074572e-02\n", + "race_WHITE -3.074572e-02\n", + "race_not reported 0.000000e+00\n", + "gender_MALE 4.652414e-02\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: FOS R-squared: 0.126\n", + "Model: OLS Adj. R-squared: 0.061\n", + "Method: Least Squares F-statistic: 1.949\n", + "Date: Fri, 09 Jun 2023 Prob (F-statistic): 0.116\n", + "Time: 12:31:41 Log-Likelihood: -79.739\n", + "No. Observations: 59 AIC: 169.5\n", + "Df Residuals: 54 BIC: 179.9\n", + "Df Model: 4 \n", + "Covariance Type: nonrobust \n", + "==================================================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "--------------------------------------------------------------------------------------------------\n", + "const -0.6589 0.592 -1.113 0.271 -1.846 0.528\n", + "AHR 0.2965 0.130 2.273 0.027 0.035 0.558\n", + "birth_days_to -4.14e-05 3.59e-05 -1.154 0.254 -0.000 3.05e-05\n", + "race_ASIAN -4.687e-17 5.24e-17 -0.894 0.375 -1.52e-16 5.82e-17\n", + "race_BLACK OR AFRICAN AMERICAN -0.2987 0.409 -0.730 0.469 -1.120 0.522\n", + "race_WHITE -0.3602 0.374 -0.964 0.339 -1.109 0.389\n", + "race_not reported 0 0 nan nan 0 0\n", + "gender_MALE 0.0465 0.263 0.177 0.860 -0.481 0.574\n", + "==============================================================================\n", + "Omnibus: 0.744 Durbin-Watson: 2.280\n", + "Prob(Omnibus): 0.689 Jarque-Bera (JB): 0.672\n", + "Skew: -0.250 Prob(JB): 0.715\n", + "Kurtosis: 2.850 Cond. No. inf\n", + "==============================================================================\n", + "\n", + "Notes:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "[2] The smallest eigenvalue is 0. This might indicate that there are\n", + "strong multicollinearity problems or that the design matrix is singular.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "data=run(expression_data=expr,\n", + " meta=meta, \n", + " CatCov=CatCov,\n", + " ConCov=ConCov,\n", + " GRN=grn.head(),\n", + " conCol='condition')" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "c3aede6a-7044-456e-affd-3804f54eb9d7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patient id(PARP1, BRCA2)(AHR, CYP1B1)(AHR, FOS)
0TCGA-55-7995-010.00.00.0
1TCGA-69-7761-010.00.00.0
2TCGA-67-6216-010.00.00.0
3TCGA-44-6148-010.00.00.0
4TCGA-71-8520-010.00.00.0
...............
510TCGA-69-7763-010.00.00.0
511TCGA-78-7150-010.00.00.0
512TCGA-MP-A4TI-010.00.00.0
513TCGA-44-6145-010.00.00.0
514TCGA-05-4427-010.00.00.0
\n", + "

515 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " patient id (PARP1, BRCA2) (AHR, CYP1B1) (AHR, FOS)\n", + "0 TCGA-55-7995-01 0.0 0.0 0.0\n", + "1 TCGA-69-7761-01 0.0 0.0 0.0\n", + "2 TCGA-67-6216-01 0.0 0.0 0.0\n", + "3 TCGA-44-6148-01 0.0 0.0 0.0\n", + "4 TCGA-71-8520-01 0.0 0.0 0.0\n", + ".. ... ... ... ...\n", + "510 TCGA-69-7763-01 0.0 0.0 0.0\n", + "511 TCGA-78-7150-01 0.0 0.0 0.0\n", + "512 TCGA-MP-A4TI-01 0.0 0.0 0.0\n", + "513 TCGA-44-6145-01 0.0 0.0 0.0\n", + "514 TCGA-05-4427-01 0.0 0.0 0.0\n", + "\n", + "[515 rows x 4 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.get_results()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "47d5da15-68dc-4599-b605-4f1d0478897a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "race condition\n", + "WHITE 1 388\n", + "not reported 1 65\n", + "WHITE 0 55\n", + "BLACK OR AFRICAN AMERICAN 1 53\n", + "ASIAN 1 8\n", + "BLACK OR AFRICAN AMERICAN 0 4\n", + "AMERICAN INDIAN OR ALASKA NATIVE 1 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta[[\"race\",\"condition\"]].value_counts()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 2bea301d0417a023a9b4e8b37b7ba258ba40deff Mon Sep 17 00:00:00 2001 From: JohannesKersting Date: Tue, 18 Jul 2023 14:47:32 +0200 Subject: [PATCH 2/3] changed lm package from sklearn to statsmodels, added model stats --- dev_tests.ipynb | 1237 +++++++++++++++++++++++++++++++++------- dysregnet/dysregnet.py | 7 +- dysregnet/functions.py | 61 +- 3 files changed, 1061 insertions(+), 244 deletions(-) diff --git a/dev_tests.ipynb b/dev_tests.ipynb index 2bb5c38..cbd1b6a 100644 --- a/dev_tests.ipynb +++ b/dev_tests.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 109, + "execution_count": 94, "id": "d03d274e-6792-4bbf-93bf-b8c7259c1d7f", "metadata": {}, "outputs": [ @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 47, "id": "b6408bef-4768-42c3-88e3-21e250102240", "metadata": {}, "outputs": [], @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 48, "id": "0b876315-e883-450e-95a4-88b7caf76085", "metadata": {}, "outputs": [], @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 49, "id": "208046d3-5d5f-4ea4-aae7-c575094bc88e", "metadata": {}, "outputs": [ @@ -266,7 +266,7 @@ "[5 rows x 38 columns]" ] }, - "execution_count": 5, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -277,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 50, "id": "35febb56-1441-4049-a452-450c25afebb1", "metadata": {}, "outputs": [ @@ -490,7 +490,7 @@ "[5 rows x 22579 columns]" ] }, - "execution_count": 6, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -501,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 51, "id": "9dd72783-5b5d-488d-937e-f1d34535b29a", "metadata": {}, "outputs": [ @@ -569,7 +569,7 @@ "4 AHR FOS" ] }, - "execution_count": 7, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -580,18 +580,18 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 82, "id": "abda2742-f17d-440c-b5b2-e021d2c26f9c", "metadata": {}, "outputs": [], "source": [ - "CatCov=['race','gender'] \n", + "CatCov=['gender'] \n", "ConCov=['birth_days_to',]" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 95, "id": "62b921d3-27e0-4f54-a77b-d3b35d5eedfb", "metadata": {}, "outputs": [ @@ -599,149 +599,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "0it [00:00, ?it/s]/home/johannes/miniconda3/envs/drn_package/lib/python3.11/site-packages/statsmodels-0.14.0-py3.11-linux-x86_64.egg/statsmodels/regression/linear_model.py:1965: RuntimeWarning: divide by zero encountered in scalar divide\n", - " return np.sqrt(eigvals[0]/eigvals[-1])\n", - "/home/johannes/miniconda3/envs/drn_package/lib/python3.11/site-packages/statsmodels-0.14.0-py3.11-linux-x86_64.egg/statsmodels/regression/linear_model.py:1965: RuntimeWarning: divide by zero encountered in scalar divide\n", - " return np.sqrt(eigvals[0]/eigvals[-1])\n", - "/home/johannes/miniconda3/envs/drn_package/lib/python3.11/site-packages/statsmodels-0.14.0-py3.11-linux-x86_64.egg/statsmodels/regression/linear_model.py:1965: RuntimeWarning: divide by zero encountered in scalar divide\n", - " return np.sqrt(eigvals[0]/eigvals[-1])\n", - "3it [00:00, 142.18it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Coefficients\n", - "PARP1 3.268853e-01\n", - "birth_days_to 6.271495e-05\n", - "race_ASIAN -2.220446e-16\n", - "race_BLACK OR AFRICAN AMERICAN 2.340830e-01\n", - "race_WHITE -2.340830e-01\n", - "race_not reported 0.000000e+00\n", - "gender_MALE 2.813038e-01\n", - " OLS Regression Results \n", - "==============================================================================\n", - "Dep. Variable: BRCA2 R-squared: 0.180\n", - "Model: OLS Adj. R-squared: 0.119\n", - "Method: Least Squares F-statistic: 2.960\n", - "Date: Fri, 09 Jun 2023 Prob (F-statistic): 0.0278\n", - "Time: 12:31:41 Log-Likelihood: -77.870\n", - "No. Observations: 59 AIC: 165.7\n", - "Df Residuals: 54 BIC: 176.1\n", - "Df Model: 4 \n", - "Covariance Type: nonrobust \n", - "==================================================================================================\n", - " coef std err t P>|t| [0.025 0.975]\n", - "--------------------------------------------------------------------------------------------------\n", - "const 1.0605 0.584 1.815 0.075 -0.111 2.232\n", - "PARP1 0.3269 0.128 2.553 0.014 0.070 0.584\n", - "birth_days_to 6.271e-05 3.54e-05 1.769 0.082 -8.35e-06 0.000\n", - "race_ASIAN 4.308e-17 5.58e-17 0.772 0.443 -6.87e-17 1.55e-16\n", - "race_BLACK OR AFRICAN AMERICAN 0.7643 0.400 1.909 0.062 -0.039 1.567\n", - "race_WHITE 0.2962 0.366 0.809 0.422 -0.438 1.030\n", - "race_not reported 0 0 nan nan 0 0\n", - "gender_MALE 0.2813 0.253 1.111 0.271 -0.226 0.789\n", - "==============================================================================\n", - "Omnibus: 1.700 Durbin-Watson: 2.015\n", - "Prob(Omnibus): 0.427 Jarque-Bera (JB): 1.224\n", - "Skew: 0.350 Prob(JB): 0.542\n", - "Kurtosis: 3.089 Cond. No. inf\n", - "==============================================================================\n", - "\n", - "Notes:\n", - "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", - "[2] The smallest eigenvalue is 0. This might indicate that there are\n", - "strong multicollinearity problems or that the design matrix is singular.\n", - " Coefficients\n", - "AHR -4.566222e-02\n", - "birth_days_to 5.034837e-05\n", - "race_ASIAN -5.551115e-17\n", - "race_BLACK OR AFRICAN AMERICAN 1.119309e-01\n", - "race_WHITE -1.119309e-01\n", - "race_not reported 0.000000e+00\n", - "gender_MALE 3.432154e-01\n", - " OLS Regression Results \n", - "==============================================================================\n", - "Dep. Variable: CYP1B1 R-squared: 0.082\n", - "Model: OLS Adj. R-squared: 0.014\n", - "Method: Least Squares F-statistic: 1.199\n", - "Date: Fri, 09 Jun 2023 Prob (F-statistic): 0.322\n", - "Time: 12:31:41 Log-Likelihood: -81.208\n", - "No. Observations: 59 AIC: 172.4\n", - "Df Residuals: 54 BIC: 182.8\n", - "Df Model: 4 \n", - "Covariance Type: nonrobust \n", - "==================================================================================================\n", - " coef std err t P>|t| [0.025 0.975]\n", - "--------------------------------------------------------------------------------------------------\n", - "const 0.7744 0.607 1.275 0.208 -0.443 1.992\n", - "AHR -0.0457 0.134 -0.341 0.734 -0.314 0.222\n", - "birth_days_to 5.035e-05 3.68e-05 1.369 0.177 -2.34e-05 0.000\n", - "race_ASIAN 1.573e-17 5.37e-17 0.293 0.771 -9.2e-17 1.23e-16\n", - "race_BLACK OR AFRICAN AMERICAN 0.4991 0.420 1.189 0.240 -0.342 1.341\n", - "race_WHITE 0.2753 0.383 0.719 0.475 -0.493 1.043\n", - "race_not reported 0 0 nan nan 0 0\n", - "gender_MALE 0.3432 0.270 1.273 0.208 -0.197 0.884\n", - "==============================================================================\n", - "Omnibus: 1.499 Durbin-Watson: 2.188\n", - "Prob(Omnibus): 0.473 Jarque-Bera (JB): 1.482\n", - "Skew: -0.357 Prob(JB): 0.477\n", - "Kurtosis: 2.693 Cond. No. inf\n", - "==============================================================================\n", - "\n", - "Notes:\n", - "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", - "[2] The smallest eigenvalue is 0. This might indicate that there are\n", - "strong multicollinearity problems or that the design matrix is singular.\n", - " Coefficients\n", - "AHR 2.965066e-01\n", - "birth_days_to -4.140146e-05\n", - "race_ASIAN -2.081668e-17\n", - "race_BLACK OR AFRICAN AMERICAN 3.074572e-02\n", - "race_WHITE -3.074572e-02\n", - "race_not reported 0.000000e+00\n", - "gender_MALE 4.652414e-02\n", - " OLS Regression Results \n", - "==============================================================================\n", - "Dep. Variable: FOS R-squared: 0.126\n", - "Model: OLS Adj. R-squared: 0.061\n", - "Method: Least Squares F-statistic: 1.949\n", - "Date: Fri, 09 Jun 2023 Prob (F-statistic): 0.116\n", - "Time: 12:31:41 Log-Likelihood: -79.739\n", - "No. Observations: 59 AIC: 169.5\n", - "Df Residuals: 54 BIC: 179.9\n", - "Df Model: 4 \n", - "Covariance Type: nonrobust \n", - "==================================================================================================\n", - " coef std err t P>|t| [0.025 0.975]\n", - "--------------------------------------------------------------------------------------------------\n", - "const -0.6589 0.592 -1.113 0.271 -1.846 0.528\n", - "AHR 0.2965 0.130 2.273 0.027 0.035 0.558\n", - "birth_days_to -4.14e-05 3.59e-05 -1.154 0.254 -0.000 3.05e-05\n", - "race_ASIAN -4.687e-17 5.24e-17 -0.894 0.375 -1.52e-16 5.82e-17\n", - "race_BLACK OR AFRICAN AMERICAN -0.2987 0.409 -0.730 0.469 -1.120 0.522\n", - "race_WHITE -0.3602 0.374 -0.964 0.339 -1.109 0.389\n", - "race_not reported 0 0 nan nan 0 0\n", - "gender_MALE 0.0465 0.263 0.177 0.860 -0.481 0.574\n", - "==============================================================================\n", - "Omnibus: 0.744 Durbin-Watson: 2.280\n", - "Prob(Omnibus): 0.689 Jarque-Bera (JB): 0.672\n", - "Skew: -0.250 Prob(JB): 0.715\n", - "Kurtosis: 2.850 Cond. No. inf\n", - "==============================================================================\n", - "\n", - "Notes:\n", - "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", - "[2] The smallest eigenvalue is 0. This might indicate that there are\n", - "strong multicollinearity problems or that the design matrix is singular.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "14979it [00:42, 354.63it/s]\n" ] } ], @@ -750,13 +608,13 @@ " meta=meta, \n", " CatCov=CatCov,\n", " ConCov=ConCov,\n", - " GRN=grn.head(),\n", + " GRN=grn,\n", " conCol='condition')" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 96, "id": "c3aede6a-7044-456e-affd-3804f54eb9d7", "metadata": {}, "outputs": [ @@ -781,46 +639,172 @@ " \n", " \n", " \n", - " patient id\n", " (PARP1, BRCA2)\n", " (AHR, CYP1B1)\n", " (AHR, FOS)\n", + " (AHR, SOS1)\n", + " (AHR, UGT1A6)\n", + " (AR, AADAC)\n", + " (AR, ABCA1)\n", + " (AR, ABCA2)\n", + " (AR, ABCF1)\n", + " (AR, ABCA4)\n", + " ...\n", + " (ZNF419, CDKN2A)\n", + " (ZNF671, CDKN2A)\n", + " (THAP7, CDKN2A)\n", + " (FOXP2, PLAUR)\n", + " (FOXP2, CNTNAP2)\n", + " (ZNF653, CDKN2A)\n", + " (E2F7, SP1)\n", + " (ZNF417, CDKN2A)\n", + " (ZNF384, CDKN2A)\n", + " (ZNF384, COL1A1)\n", + " \n", + " \n", + " patient id\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " TCGA-55-7995-01\n", + " TCGA-55-7995-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " -11.1\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", - " 1\n", - " TCGA-69-7761-01\n", + " TCGA-69-7761-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", - " 2\n", - " TCGA-67-6216-01\n", + " TCGA-67-6216-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 6.9\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " -5.3\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", - " 3\n", - " TCGA-44-6148-01\n", + " TCGA-44-6148-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", - " 4\n", - " TCGA-71-8520-01\n", + " TCGA-71-8520-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 4.8\n", + " 0.0\n", + " 5.5\n", + " 0.0\n", + " 0.0\n", " 0.0\n", " 0.0\n", + " 4.7\n", + " 4.7\n", " 0.0\n", " \n", " \n", @@ -829,65 +813,238 @@ " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 510\n", - " TCGA-69-7763-01\n", + " TCGA-69-7763-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", - " 511\n", - " TCGA-78-7150-01\n", + " TCGA-78-7150-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 5.4\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 5.3\n", + " 4.1\n", + " 5.4\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 4.9\n", + " 5.3\n", + " 0.0\n", " \n", " \n", - " 512\n", - " TCGA-MP-A4TI-01\n", + " TCGA-MP-A4TI-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " -8.3\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", - " 513\n", - " TCGA-44-6145-01\n", + " TCGA-44-6145-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " -8.5\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", - " 514\n", - " TCGA-05-4427-01\n", + " TCGA-05-4427-01\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 6.0\n", + " -11.4\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " \n", " \n", "\n", - "

515 rows × 4 columns

\n", + "

515 rows × 14979 columns

\n", "" ], "text/plain": [ - " patient id (PARP1, BRCA2) (AHR, CYP1B1) (AHR, FOS)\n", - "0 TCGA-55-7995-01 0.0 0.0 0.0\n", - "1 TCGA-69-7761-01 0.0 0.0 0.0\n", - "2 TCGA-67-6216-01 0.0 0.0 0.0\n", - "3 TCGA-44-6148-01 0.0 0.0 0.0\n", - "4 TCGA-71-8520-01 0.0 0.0 0.0\n", - ".. ... ... ... ...\n", - "510 TCGA-69-7763-01 0.0 0.0 0.0\n", - "511 TCGA-78-7150-01 0.0 0.0 0.0\n", - "512 TCGA-MP-A4TI-01 0.0 0.0 0.0\n", - "513 TCGA-44-6145-01 0.0 0.0 0.0\n", - "514 TCGA-05-4427-01 0.0 0.0 0.0\n", - "\n", - "[515 rows x 4 columns]" + " (PARP1, BRCA2) (AHR, CYP1B1) (AHR, FOS) (AHR, SOS1) \\\n", + "patient id \n", + "TCGA-55-7995-01 0.0 0.0 0.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 0.0 0.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 0.0 0.0 0.0 0.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 0.0 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 0.0 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 0.0 0.0 \n", + "\n", + " (AHR, UGT1A6) (AR, AADAC) (AR, ABCA1) (AR, ABCA2) \\\n", + "patient id \n", + "TCGA-55-7995-01 0.0 0.0 0.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 6.9 0.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 0.0 5.4 0.0 0.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 0.0 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 0.0 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 0.0 0.0 \n", + "\n", + " (AR, ABCF1) (AR, ABCA4) ... (ZNF419, CDKN2A) \\\n", + "patient id ... \n", + "TCGA-55-7995-01 0.0 0.0 ... 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 ... 0.0 \n", + "TCGA-67-6216-01 0.0 0.0 ... 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 ... 0.0 \n", + "TCGA-71-8520-01 0.0 0.0 ... 4.8 \n", + "... ... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 ... 0.0 \n", + "TCGA-78-7150-01 0.0 0.0 ... 5.3 \n", + "TCGA-MP-A4TI-01 0.0 0.0 ... 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 ... 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 ... 0.0 \n", + "\n", + " (ZNF671, CDKN2A) (THAP7, CDKN2A) (FOXP2, PLAUR) \\\n", + "patient id \n", + "TCGA-55-7995-01 0.0 0.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 0.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 0.0 5.5 0.0 \n", + "... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 4.1 5.4 0.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 0.0 \n", + "\n", + " (FOXP2, CNTNAP2) (ZNF653, CDKN2A) (E2F7, SP1) \\\n", + "patient id \n", + "TCGA-55-7995-01 0.0 -11.1 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 -5.3 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 0.0 0.0 0.0 \n", + "TCGA-MP-A4TI-01 0.0 -8.3 0.0 \n", + "TCGA-44-6145-01 0.0 -8.5 0.0 \n", + "TCGA-05-4427-01 6.0 -11.4 0.0 \n", + "\n", + " (ZNF417, CDKN2A) (ZNF384, CDKN2A) (ZNF384, COL1A1) \n", + "patient id \n", + "TCGA-55-7995-01 0.0 0.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 0.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 4.7 4.7 0.0 \n", + "... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 4.9 5.3 0.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 0.0 \n", + "\n", + "[515 rows x 14979 columns]" ] }, - "execution_count": 43, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -898,32 +1055,680 @@ }, { "cell_type": "code", - "execution_count": 55, - "id": "47d5da15-68dc-4599-b605-4f1d0478897a", + "execution_count": 97, + "id": "e719292a-4d97-426a-b1aa-c64e1f0b1837", "metadata": {}, "outputs": [ { "data": { - "text/plain": [ - "race condition\n", - "WHITE 1 388\n", - "not reported 1 65\n", - "WHITE 0 55\n", - "BLACK OR AFRICAN AMERICAN 1 53\n", - "ASIAN 1 8\n", - "BLACK OR AFRICAN AMERICAN 0 4\n", - "AMERICAN INDIAN OR ALASKA NATIVE 1 1\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "meta[[\"race\",\"condition\"]].value_counts()" - ] + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
R2coef_interceptcoef_TFcoef_birth_days_tocoef_gender_MALEpval_interceptpval_TFpval_birth_days_topval_gender_MALE
(PARP1, BRCA2)0.1663371.5157850.3307290.0000680.2853080.0877951.234803e-020.0565020.263960
(AHR, CYP1B1)0.0784801.125920-0.0429420.0000530.3446630.2162617.472208e-010.1474720.203034
(AHR, FOS)0.125917-0.9981600.297254-0.0000410.0469220.2597782.512678e-020.2509620.857749
(AHR, SOS1)0.545301-0.9857230.696733-0.0000400.0826820.1247166.121357e-100.1237210.661683
(AHR, UGT1A6)0.146786-0.752809-0.174668-0.000041-0.5629130.3885051.764701e-010.2391520.033036
..............................
(ZNF653, CDKN2A)0.333566-2.066818-0.020549-0.000100-0.8100590.0092358.554798e-010.0019180.000677
(E2F7, SP1)0.100177-1.7008840.162334-0.0000670.1942400.0585942.147244e-010.0621780.460722
(ZNF417, CDKN2A)0.337528-1.9906190.066682-0.000097-0.8240560.0107855.495458e-010.0021480.000572
(ZNF384, CDKN2A)0.340745-1.8830790.090320-0.000093-0.8338610.0181864.297584e-010.0040010.000505
(ZNF384, COL1A1)0.144069-0.5717880.348782-0.000025-0.0455090.5191969.296426e-030.4888150.860108
\n", + "

14979 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " R2 coef_intercept coef_TF coef_birth_days_to \\\n", + "(PARP1, BRCA2) 0.166337 1.515785 0.330729 0.000068 \n", + "(AHR, CYP1B1) 0.078480 1.125920 -0.042942 0.000053 \n", + "(AHR, FOS) 0.125917 -0.998160 0.297254 -0.000041 \n", + "(AHR, SOS1) 0.545301 -0.985723 0.696733 -0.000040 \n", + "(AHR, UGT1A6) 0.146786 -0.752809 -0.174668 -0.000041 \n", + "... ... ... ... ... \n", + "(ZNF653, CDKN2A) 0.333566 -2.066818 -0.020549 -0.000100 \n", + "(E2F7, SP1) 0.100177 -1.700884 0.162334 -0.000067 \n", + "(ZNF417, CDKN2A) 0.337528 -1.990619 0.066682 -0.000097 \n", + "(ZNF384, CDKN2A) 0.340745 -1.883079 0.090320 -0.000093 \n", + "(ZNF384, COL1A1) 0.144069 -0.571788 0.348782 -0.000025 \n", + "\n", + " coef_gender_MALE pval_intercept pval_TF \\\n", + "(PARP1, BRCA2) 0.285308 0.087795 1.234803e-02 \n", + "(AHR, CYP1B1) 0.344663 0.216261 7.472208e-01 \n", + "(AHR, FOS) 0.046922 0.259778 2.512678e-02 \n", + "(AHR, SOS1) 0.082682 0.124716 6.121357e-10 \n", + "(AHR, UGT1A6) -0.562913 0.388505 1.764701e-01 \n", + "... ... ... ... \n", + "(ZNF653, CDKN2A) -0.810059 0.009235 8.554798e-01 \n", + "(E2F7, SP1) 0.194240 0.058594 2.147244e-01 \n", + "(ZNF417, CDKN2A) -0.824056 0.010785 5.495458e-01 \n", + "(ZNF384, CDKN2A) -0.833861 0.018186 4.297584e-01 \n", + "(ZNF384, COL1A1) -0.045509 0.519196 9.296426e-03 \n", + "\n", + " pval_birth_days_to pval_gender_MALE \n", + "(PARP1, BRCA2) 0.056502 0.263960 \n", + "(AHR, CYP1B1) 0.147472 0.203034 \n", + "(AHR, FOS) 0.250962 0.857749 \n", + "(AHR, SOS1) 0.123721 0.661683 \n", + "(AHR, UGT1A6) 0.239152 0.033036 \n", + "... ... ... \n", + "(ZNF653, CDKN2A) 0.001918 0.000677 \n", + "(E2F7, SP1) 0.062178 0.460722 \n", + "(ZNF417, CDKN2A) 0.002148 0.000572 \n", + "(ZNF384, CDKN2A) 0.004001 0.000505 \n", + "(ZNF384, COL1A1) 0.488815 0.860108 \n", + "\n", + "[14979 rows x 9 columns]" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.get_model_stats()" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "3aacbc4e-8f13-48ef-849a-7404f45e9573", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
(PARP1, BRCA2)(AHR, CYP1B1)(AHR, FOS)(AHR, SOS1)(AHR, UGT1A6)(AR, AADAC)(AR, ABCA1)(AR, ABCA2)(AR, ABCF1)(AR, ABCA4)...(ZNF419, CDKN2A)(ZNF671, CDKN2A)(THAP7, CDKN2A)(FOXP2, PLAUR)(FOXP2, CNTNAP2)(ZNF653, CDKN2A)(E2F7, SP1)(ZNF417, CDKN2A)(ZNF384, CDKN2A)(ZNF384, COL1A1)
patient id
TCGA-55-7995-010.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.01.00.00.00.00.0
TCGA-69-7761-010.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
TCGA-67-6216-010.00.00.00.00.01.00.00.00.00.0...0.00.00.00.00.01.00.00.00.00.0
TCGA-44-6148-010.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
TCGA-71-8520-010.00.00.00.00.00.00.00.00.00.0...1.00.01.00.00.00.00.01.01.00.0
..................................................................
TCGA-69-7763-010.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
TCGA-78-7150-010.00.00.00.00.01.00.00.00.00.0...1.01.01.00.00.00.00.01.01.00.0
TCGA-MP-A4TI-010.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.01.00.00.00.00.0
TCGA-44-6145-010.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.01.00.00.00.00.0
TCGA-05-4427-010.00.00.00.00.00.00.00.00.00.0...0.00.00.00.01.01.00.00.00.00.0
\n", + "

515 rows × 14979 columns

\n", + "
" + ], + "text/plain": [ + " (PARP1, BRCA2) (AHR, CYP1B1) (AHR, FOS) (AHR, SOS1) \\\n", + "patient id \n", + "TCGA-55-7995-01 0.0 0.0 0.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 0.0 0.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 0.0 0.0 0.0 0.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 0.0 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 0.0 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 0.0 0.0 \n", + "\n", + " (AHR, UGT1A6) (AR, AADAC) (AR, ABCA1) (AR, ABCA2) \\\n", + "patient id \n", + "TCGA-55-7995-01 0.0 0.0 0.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 1.0 0.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 0.0 1.0 0.0 0.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 0.0 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 0.0 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 0.0 0.0 \n", + "\n", + " (AR, ABCF1) (AR, ABCA4) ... (ZNF419, CDKN2A) \\\n", + "patient id ... \n", + "TCGA-55-7995-01 0.0 0.0 ... 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 ... 0.0 \n", + "TCGA-67-6216-01 0.0 0.0 ... 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 ... 0.0 \n", + "TCGA-71-8520-01 0.0 0.0 ... 1.0 \n", + "... ... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 ... 0.0 \n", + "TCGA-78-7150-01 0.0 0.0 ... 1.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 ... 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 ... 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 ... 0.0 \n", + "\n", + " (ZNF671, CDKN2A) (THAP7, CDKN2A) (FOXP2, PLAUR) \\\n", + "patient id \n", + "TCGA-55-7995-01 0.0 0.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 0.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 0.0 1.0 0.0 \n", + "... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 1.0 1.0 0.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 0.0 \n", + "\n", + " (FOXP2, CNTNAP2) (ZNF653, CDKN2A) (E2F7, SP1) \\\n", + "patient id \n", + "TCGA-55-7995-01 0.0 1.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 1.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 0.0 0.0 0.0 \n", + "TCGA-MP-A4TI-01 0.0 1.0 0.0 \n", + "TCGA-44-6145-01 0.0 1.0 0.0 \n", + "TCGA-05-4427-01 1.0 1.0 0.0 \n", + "\n", + " (ZNF417, CDKN2A) (ZNF384, CDKN2A) (ZNF384, COL1A1) \n", + "patient id \n", + "TCGA-55-7995-01 0.0 0.0 0.0 \n", + "TCGA-69-7761-01 0.0 0.0 0.0 \n", + "TCGA-67-6216-01 0.0 0.0 0.0 \n", + "TCGA-44-6148-01 0.0 0.0 0.0 \n", + "TCGA-71-8520-01 1.0 1.0 0.0 \n", + "... ... ... ... \n", + "TCGA-69-7763-01 0.0 0.0 0.0 \n", + "TCGA-78-7150-01 1.0 1.0 0.0 \n", + "TCGA-MP-A4TI-01 0.0 0.0 0.0 \n", + "TCGA-44-6145-01 0.0 0.0 0.0 \n", + "TCGA-05-4427-01 0.0 0.0 0.0 \n", + "\n", + "[515 rows x 14979 columns]" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.get_results_binary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a77d69fa-9401-466a-a20a-29d738649016", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/dysregnet/dysregnet.py b/dysregnet/dysregnet.py index a2e045a..6b96f05 100644 --- a/dysregnet/dysregnet.py +++ b/dysregnet/dysregnet.py @@ -138,7 +138,7 @@ def __init__(self, self.cov_df,self.expr, self.control, self.case = functions.process_data(self) - self.results=functions.dyregnet_model(self) + self.results, self.model_stats = functions.dyregnet_model(self) @@ -151,11 +151,14 @@ def get_results(self): def get_results_binary(self): res_binary=self.results.copy() - res_binary=res_binary.set_index('patient id') res_binary = res_binary.where(res_binary==0, other=1) return res_binary + + def get_model_stats(self): + return self.model_stats + diff --git a/dysregnet/functions.py b/dysregnet/functions.py index 8a423f8..12b6c8d 100644 --- a/dysregnet/functions.py +++ b/dysregnet/functions.py @@ -11,7 +11,7 @@ def process_data(data): - # process covariates and desing martic + # process covariates and design matrix all_covariates= data.CatCov + data.ConCov @@ -32,7 +32,7 @@ def process_data(data): # process categorial covariate # drop_first is important to avoid multicollinear - cov_df=pd.get_dummies(cov_df, columns=data.CatCov, drop_first=True) + cov_df=pd.get_dummies(cov_df, columns=data.CatCov, drop_first=True, dtype=int) @@ -67,63 +67,64 @@ def dyregnet_model(data): case=data.expr.loc[data.case] covariate_name=[] - edges={} - edges['patient id']=list(case.index) + edges = {} + edges['patient id']=list(case.index.values) + model_stats = {} for tup in tqdm(data.GRN.itertuples()): # pvalues for the same edge for all patients edge = (tup[1],tup[2]) # skip self loops - if edge[0]!=edge[1]: - + if edge[0] != edge[1]: + # prepare control for fitting model - x_train = control[ [edge[0]] + covariate_name ].values + x_train = control[ [edge[0]] + covariate_name ] + x_train = sm.add_constant(x_train) # add bias y_train = control[edge[1]].values # fit the model - reg = LinearRegression().fit(x_train, y_train) + model = sm.OLS(y_train, x_train) + results = model.fit() - #get residuals of control - resid_control =reg.predict(x_train) - y_train + # get residuals of control + resid_control = results.predict(x_train) - y_train - # test data (case or condition) - x_test = case[ [edge[0]]+ covariate_name ].values + x_test = case[ [edge[0]]+ covariate_name ] + x_test = sm.add_constant(x_test) # add bias y_test = case[edge[1]].values - # define residue for cases - resid_case = reg.predict(x_test) - y_test - - + resid_case = results.predict(x_test) - y_test + # condition of direction - cond=True - direction= np.sign(reg.coef_[0]) + cond = True + direction = np.sign(results.params[1]) # two sided p_value as default # if direction_condition is false calculate, two sided p value - sides=2 + sides = 2 if data.direction_condition: - cond=( direction * resid_case )>0 + cond = ( direction * resid_case ) > 0 # if direction_condition is true only calculate one sided p value - sides=1 + sides = 1 # calculate zscore - zscore=(resid_case-resid_control.mean())/resid_control.std() + zscore= (resid_case - resid_control.mean()) / resid_control.std() # Quality check of the fitness (optionally and must be provided by user) - if (data.R2_threshold is not None) and ( data.R2_threshold > reg.score(x_train, y_train) ): + if (data.R2_threshold is not None) and ( data.R2_threshold > results.rsquared ): # model fit is not that good on training # shrink the zscores edges[edge]= [0.0] * len(zscore) @@ -160,8 +161,16 @@ def dyregnet_model(data): zscore[~valid]=0.0 - edges[edge]=np.round(zscore, 1) + edges[edge] = np.round(zscore, 1) + model_stats[edge] = [results.rsquared] + list(results.params.values) + list(results.pvalues.values) + + - data=pd.DataFrame.from_dict(edges) + results = pd.DataFrame.from_dict(edges) + results = results.set_index('patient id') + + model_stats_cols = ["R2"] + ["coef_" + coef for coef in ["intercept", "TF"] + covariate_name] + ["pval_" + coef for coef in ["intercept", "TF"] + covariate_name] + model_stats = pd.DataFrame([model_stats[edge] for edge in results.columns], index=results.columns, columns=model_stats_cols) + - return data + return results, model_stats From e6b9d1a45f122acd7e0756e2dcfa9ab6bde90114 Mon Sep 17 00:00:00 2001 From: JohannesKersting Date: Wed, 19 Jul 2023 14:51:38 +0200 Subject: [PATCH 3/3] preparing for version 0.0.4 --- README.md | 3 + dev_tests.ipynb | 49 +- setup.py | 5 +- {dysregnet => src/dysregnet}/__init__.py | 0 {dysregnet => src/dysregnet}/dysregnet.py | 2 +- {dysregnet => src/dysregnet}/functions.py | 4 +- test.ipynb | 601 ++++++++++++++++++---- 7 files changed, 541 insertions(+), 123 deletions(-) rename {dysregnet => src/dysregnet}/__init__.py (100%) rename {dysregnet => src/dysregnet}/dysregnet.py (99%) rename {dysregnet => src/dysregnet}/functions.py (99%) diff --git a/README.md b/README.md index c62d027..3e595c1 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,9 @@ data.get_results() data.get_results_binary() +# get R2 values, coefficients, and coefficient p-values for all models/edges +data.get_model_stats() + ``` The expected run time for the installation and running the demo dataset on a "normal" desktop computer is around 3~5 minutes. diff --git a/dev_tests.ipynb b/dev_tests.ipynb index cbd1b6a..1c16fad 100644 --- a/dev_tests.ipynb +++ b/dev_tests.ipynb @@ -2,29 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 94, + "execution_count": 1, "id": "d03d274e-6792-4bbf-93bf-b8c7259c1d7f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "import pandas as pd\n", - "from dysregnet.dysregnet import run" + "from src.dysregnet.dysregnet import run" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 2, "id": "b6408bef-4768-42c3-88e3-21e250102240", "metadata": {}, "outputs": [], @@ -36,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 3, "id": "0b876315-e883-450e-95a4-88b7caf76085", "metadata": {}, "outputs": [], @@ -53,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 4, "id": "208046d3-5d5f-4ea4-aae7-c575094bc88e", "metadata": {}, "outputs": [ @@ -266,7 +257,7 @@ "[5 rows x 38 columns]" ] }, - "execution_count": 49, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -277,7 +268,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 5, "id": "35febb56-1441-4049-a452-450c25afebb1", "metadata": {}, "outputs": [ @@ -490,7 +481,7 @@ "[5 rows x 22579 columns]" ] }, - "execution_count": 50, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -501,7 +492,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 6, "id": "9dd72783-5b5d-488d-937e-f1d34535b29a", "metadata": {}, "outputs": [ @@ -569,7 +560,7 @@ "4 AHR FOS" ] }, - "execution_count": 51, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -580,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 7, "id": "abda2742-f17d-440c-b5b2-e021d2c26f9c", "metadata": {}, "outputs": [], @@ -591,7 +582,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 8, "id": "62b921d3-27e0-4f54-a77b-d3b35d5eedfb", "metadata": {}, "outputs": [ @@ -599,7 +590,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "14979it [00:42, 354.63it/s]\n" + "14979it [00:45, 332.12it/s]\n" ] } ], @@ -614,7 +605,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 9, "id": "c3aede6a-7044-456e-affd-3804f54eb9d7", "metadata": {}, "outputs": [ @@ -1044,7 +1035,7 @@ "[515 rows x 14979 columns]" ] }, - "execution_count": 96, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1055,7 +1046,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 10, "id": "e719292a-4d97-426a-b1aa-c64e1f0b1837", "metadata": {}, "outputs": [ @@ -1272,7 +1263,7 @@ "[14979 rows x 9 columns]" ] }, - "execution_count": 97, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1283,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 11, "id": "3aacbc4e-8f13-48ef-849a-7404f45e9573", "metadata": {}, "outputs": [ @@ -1713,7 +1704,7 @@ "[515 rows x 14979 columns]" ] }, - "execution_count": 98, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } diff --git a/setup.py b/setup.py index f406a04..227998b 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup(name='dysregnet', - version='0.0.3', + version='0.0.4', description='DysRegNet', long_description=README, long_description_content_type="text/markdown", @@ -27,7 +27,8 @@ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", "Topic :: Scientific/Engineering :: Bio-Informatics", ], - packages=find_packages(), + package_dir = {'': 'src'}, + packages=['dysregnet'], include_package_data=True, python_requires='>=3.7', install_requires=[ diff --git a/dysregnet/__init__.py b/src/dysregnet/__init__.py similarity index 100% rename from dysregnet/__init__.py rename to src/dysregnet/__init__.py diff --git a/dysregnet/dysregnet.py b/src/dysregnet/dysregnet.py similarity index 99% rename from dysregnet/dysregnet.py rename to src/dysregnet/dysregnet.py index 6b96f05..3459a73 100644 --- a/dysregnet/dysregnet.py +++ b/src/dysregnet/dysregnet.py @@ -57,7 +57,7 @@ def __init__(self, List of continuous covariates. They should match the name of their columns in meta Dataframe. - zscoring: boolean, default: True + zscoring: boolean, default: False zscoring of expression data (if needed). bonferroni_alpha: Float diff --git a/dysregnet/functions.py b/src/dysregnet/functions.py similarity index 99% rename from dysregnet/functions.py rename to src/dysregnet/functions.py index 12b6c8d..22d8a49 100644 --- a/dysregnet/functions.py +++ b/src/dysregnet/functions.py @@ -86,6 +86,9 @@ def dyregnet_model(data): # fit the model model = sm.OLS(y_train, x_train) results = model.fit() + + model_stats[edge] = [results.rsquared] + list(results.params.values) + list(results.pvalues.values) + # get residuals of control resid_control = results.predict(x_train) - y_train @@ -162,7 +165,6 @@ def dyregnet_model(data): edges[edge] = np.round(zscore, 1) - model_stats[edge] = [results.rsquared] + list(results.params.values) + list(results.pvalues.values) diff --git a/test.ipynb b/test.ipynb index 86a0bbd..aa080da 100644 --- a/test.ipynb +++ b/test.ipynb @@ -5,9 +5,19 @@ "execution_count": 1, "id": "b15e0cd9-bcec-47ca-b095-326d208de825", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.0.4\n" + ] + } + ], "source": [ - "import dysregnet" + "import dysregnet\n", + "import importlib.metadata\n", + "print(importlib.metadata.version('dysregnet'))" ] }, { @@ -554,7 +564,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "14162it [00:37, 381.60it/s]\n" + "14162it [00:50, 280.14it/s]\n" ] } ], @@ -604,7 +614,6 @@ " \n", " \n", " \n", - " patient id\n", " (PARP1, BRCA2)\n", " (AHR, CYP1B1)\n", " (AHR, FOS)\n", @@ -614,6 +623,7 @@ " (AR, ABCA2)\n", " (AR, ABCF1)\n", " (AR, ABCA4)\n", + " (AR, ABL1)\n", " ...\n", " (ZNF419, CDKN2A)\n", " (ZNF671, CDKN2A)\n", @@ -626,11 +636,35 @@ " (ZNF384, CDKN2A)\n", " (ZNF384, COL1A1)\n", " \n", + " \n", + " patient id\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " 0\n", - " TCGA-3C-AAAU-01\n", + " TCGA-3C-AAAU-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -653,8 +687,8 @@ " 0.0\n", " \n", " \n", - " 1\n", - " TCGA-3C-AALI-01\n", + " TCGA-3C-AALI-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -677,8 +711,8 @@ " 0.0\n", " \n", " \n", - " 2\n", - " TCGA-3C-AALJ-01\n", + " TCGA-3C-AALJ-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -701,8 +735,8 @@ " 0.0\n", " \n", " \n", - " 3\n", - " TCGA-3C-AALK-01\n", + " TCGA-3C-AALK-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -725,8 +759,8 @@ " 0.0\n", " \n", " \n", - " 4\n", - " TCGA-4H-AAAK-01\n", + " TCGA-4H-AAAK-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -773,8 +807,8 @@ " ...\n", " \n", " \n", - " 1093\n", - " TCGA-WT-AB44-01\n", + " TCGA-WT-AB44-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -797,8 +831,8 @@ " 0.0\n", " \n", " \n", - " 1094\n", - " TCGA-XX-A899-01\n", + " TCGA-XX-A899-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -821,8 +855,8 @@ " 0.0\n", " \n", " \n", - " 1095\n", - " TCGA-XX-A89A-01\n", + " TCGA-XX-A89A-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -845,8 +879,8 @@ " 0.0\n", " \n", " \n", - " 1096\n", - " TCGA-Z7-A8R5-01\n", + " TCGA-Z7-A8R5-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -869,8 +903,8 @@ " 0.0\n", " \n", " \n", - " 1097\n", - " TCGA-Z7-A8R6-01\n", + " TCGA-Z7-A8R6-01\n", + " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", @@ -894,76 +928,95 @@ " \n", " \n", "\n", - "

1098 rows × 14148 columns

\n", + "

1098 rows × 14147 columns

\n", "" ], "text/plain": [ - " patient id (PARP1, BRCA2) (AHR, CYP1B1) (AHR, FOS) (AHR, SOS1) \\\n", - "0 TCGA-3C-AAAU-01 0.0 0.0 0.0 0.0 \n", - "1 TCGA-3C-AALI-01 0.0 0.0 0.0 0.0 \n", - "2 TCGA-3C-AALJ-01 0.0 0.0 0.0 0.0 \n", - "3 TCGA-3C-AALK-01 0.0 0.0 0.0 0.0 \n", - "4 TCGA-4H-AAAK-01 0.0 0.0 0.0 0.0 \n", - "... ... ... ... ... ... \n", - "1093 TCGA-WT-AB44-01 0.0 0.0 0.0 0.0 \n", - "1094 TCGA-XX-A899-01 0.0 0.0 0.0 0.0 \n", - "1095 TCGA-XX-A89A-01 0.0 0.0 0.0 0.0 \n", - "1096 TCGA-Z7-A8R5-01 0.0 0.0 0.0 0.0 \n", - "1097 TCGA-Z7-A8R6-01 0.0 0.0 0.0 0.0 \n", + " (PARP1, BRCA2) (AHR, CYP1B1) (AHR, FOS) (AHR, SOS1) \\\n", + "patient id \n", + "TCGA-3C-AAAU-01 0.0 0.0 0.0 0.0 \n", + "TCGA-3C-AALI-01 0.0 0.0 0.0 0.0 \n", + "TCGA-3C-AALJ-01 0.0 0.0 0.0 0.0 \n", + "TCGA-3C-AALK-01 0.0 0.0 0.0 0.0 \n", + "TCGA-4H-AAAK-01 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "TCGA-WT-AB44-01 0.0 0.0 0.0 0.0 \n", + "TCGA-XX-A899-01 0.0 0.0 0.0 0.0 \n", + "TCGA-XX-A89A-01 0.0 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R5-01 0.0 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R6-01 0.0 0.0 0.0 0.0 \n", "\n", - " (AHR, UGT1A6) (AR, ABCA1) (AR, ABCA2) (AR, ABCF1) (AR, ABCA4) ... \\\n", - "0 0.0 0.0 0.0 0.0 0.0 ... \n", - "1 0.0 0.0 0.0 0.0 0.0 ... \n", - "2 0.0 0.0 0.0 0.0 0.0 ... \n", - "3 0.0 0.0 0.0 0.0 0.0 ... \n", - "4 0.0 0.0 0.0 0.0 0.0 ... \n", - "... ... ... ... ... ... ... \n", - "1093 0.0 0.0 0.0 0.0 0.0 ... \n", - "1094 0.0 0.0 0.0 0.0 0.0 ... \n", - "1095 0.0 0.0 0.0 0.0 0.0 ... \n", - "1096 0.0 0.0 0.0 0.0 0.0 ... \n", - "1097 0.0 0.0 0.0 0.0 0.0 ... \n", + " (AHR, UGT1A6) (AR, ABCA1) (AR, ABCA2) (AR, ABCF1) \\\n", + "patient id \n", + "TCGA-3C-AAAU-01 0.0 0.0 0.0 0.0 \n", + "TCGA-3C-AALI-01 0.0 0.0 0.0 0.0 \n", + "TCGA-3C-AALJ-01 0.0 0.0 0.0 0.0 \n", + "TCGA-3C-AALK-01 0.0 0.0 0.0 0.0 \n", + "TCGA-4H-AAAK-01 0.0 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "TCGA-WT-AB44-01 0.0 0.0 0.0 0.0 \n", + "TCGA-XX-A899-01 0.0 0.0 0.0 0.0 \n", + "TCGA-XX-A89A-01 0.0 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R5-01 0.0 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R6-01 0.0 0.0 0.0 0.0 \n", "\n", - " (ZNF419, CDKN2A) (ZNF671, CDKN2A) (THAP7, CDKN2A) (FOXP2, PLAUR) \\\n", - "0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 \n", - "2 0.0 0.0 0.0 0.0 \n", - "3 0.0 0.0 0.0 0.0 \n", - "4 0.0 0.0 0.0 0.0 \n", - "... ... ... ... ... \n", - "1093 0.0 0.0 0.0 0.0 \n", - "1094 0.0 0.0 0.0 0.0 \n", - "1095 0.0 0.0 0.0 0.0 \n", - "1096 0.0 0.0 0.0 0.0 \n", - "1097 0.0 0.0 0.0 0.0 \n", + " (AR, ABCA4) (AR, ABL1) ... (ZNF419, CDKN2A) \\\n", + "patient id ... \n", + "TCGA-3C-AAAU-01 0.0 0.0 ... 0.0 \n", + "TCGA-3C-AALI-01 0.0 0.0 ... 0.0 \n", + "TCGA-3C-AALJ-01 0.0 0.0 ... 0.0 \n", + "TCGA-3C-AALK-01 0.0 0.0 ... 0.0 \n", + "TCGA-4H-AAAK-01 0.0 0.0 ... 0.0 \n", + "... ... ... ... ... \n", + "TCGA-WT-AB44-01 0.0 0.0 ... 0.0 \n", + "TCGA-XX-A899-01 0.0 0.0 ... 0.0 \n", + "TCGA-XX-A89A-01 0.0 0.0 ... 0.0 \n", + "TCGA-Z7-A8R5-01 0.0 0.0 ... 0.0 \n", + "TCGA-Z7-A8R6-01 0.0 0.0 ... 0.0 \n", "\n", - " (FOXP2, CNTNAP2) (ZNF653, CDKN2A) (E2F7, SP1) (ZNF417, CDKN2A) \\\n", - "0 0.0 0.0 0.0 0.0 \n", - "1 0.0 0.0 0.0 0.0 \n", - "2 0.0 0.0 0.0 0.0 \n", - "3 0.0 0.0 0.0 0.0 \n", - "4 0.0 0.0 0.0 0.0 \n", - "... ... ... ... ... \n", - "1093 0.0 0.0 0.0 0.0 \n", - "1094 0.0 0.0 0.0 0.0 \n", - "1095 0.0 0.0 0.0 0.0 \n", - "1096 0.0 0.0 0.0 0.0 \n", - "1097 0.0 0.0 0.0 0.0 \n", + " (ZNF671, CDKN2A) (THAP7, CDKN2A) (FOXP2, PLAUR) \\\n", + "patient id \n", + "TCGA-3C-AAAU-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALI-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALJ-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALK-01 0.0 0.0 0.0 \n", + "TCGA-4H-AAAK-01 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "TCGA-WT-AB44-01 0.0 0.0 0.0 \n", + "TCGA-XX-A899-01 0.0 0.0 0.0 \n", + "TCGA-XX-A89A-01 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R5-01 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R6-01 0.0 0.0 0.0 \n", "\n", - " (ZNF384, CDKN2A) (ZNF384, COL1A1) \n", - "0 0.0 0.0 \n", - "1 0.0 0.0 \n", - "2 0.0 0.0 \n", - "3 0.0 0.0 \n", - "4 0.0 0.0 \n", - "... ... ... \n", - "1093 0.0 0.0 \n", - "1094 0.0 0.0 \n", - "1095 0.0 0.0 \n", - "1096 0.0 0.0 \n", - "1097 0.0 0.0 \n", + " (FOXP2, CNTNAP2) (ZNF653, CDKN2A) (E2F7, SP1) \\\n", + "patient id \n", + "TCGA-3C-AAAU-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALI-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALJ-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALK-01 0.0 0.0 0.0 \n", + "TCGA-4H-AAAK-01 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "TCGA-WT-AB44-01 0.0 0.0 0.0 \n", + "TCGA-XX-A899-01 0.0 0.0 0.0 \n", + "TCGA-XX-A89A-01 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R5-01 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R6-01 0.0 0.0 0.0 \n", "\n", - "[1098 rows x 14148 columns]" + " (ZNF417, CDKN2A) (ZNF384, CDKN2A) (ZNF384, COL1A1) \n", + "patient id \n", + "TCGA-3C-AAAU-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALI-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALJ-01 0.0 0.0 0.0 \n", + "TCGA-3C-AALK-01 0.0 0.0 0.0 \n", + "TCGA-4H-AAAK-01 0.0 0.0 0.0 \n", + "... ... ... ... \n", + "TCGA-WT-AB44-01 0.0 0.0 0.0 \n", + "TCGA-XX-A899-01 0.0 0.0 0.0 \n", + "TCGA-XX-A89A-01 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R5-01 0.0 0.0 0.0 \n", + "TCGA-Z7-A8R6-01 0.0 0.0 0.0 \n", + "\n", + "[1098 rows x 14147 columns]" ] }, "execution_count": 9, @@ -1418,11 +1471,379 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "0aa5c03b-1886-4b06-a6dd-c7933a00046c", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
R2coef_interceptcoef_TFcoef_birth_days_tocoef_race_ASIANcoef_race_BLACK OR AFRICAN AMERICANcoef_race_WHITEcoef_race_[Not Evaluated]coef_gender_MALEpval_interceptpval_TFpval_birth_days_topval_race_ASIANpval_race_BLACK OR AFRICAN AMERICANpval_race_WHITEpval_race_[Not Evaluated]pval_gender_MALE
(PARP1, BRCA2)0.518944-0.6285821.0090240.0000161.6414091.6726601.6965220.0-0.1196850.3876062.496552e-140.1964700.0792810.0196980.010310NaN0.852860
(AHR, CYP1B1)0.460382-0.3728200.524032-0.0000070.7675100.1314280.0023620.0-0.1557640.4210932.162187e-140.3552660.2122390.7828950.995718NaN0.716738
(AHR, FOS)0.0756041.4470060.005936-0.000021-1.688957-0.004656-0.2603880.00.9979010.0771009.545845e-010.1328890.1187680.9955640.736247NaN0.187290
(AHR, SOS1)0.4692120.6750380.7640640.000008-0.986869-0.397666-0.2184860.01.3403830.3194152.434483e-140.4915120.2721520.5686830.734236NaN0.034528
(AHR, UGT1A6)0.071785-1.4222490.133097-0.000007-0.7513960.7896071.1009600.00.5633320.1475202.895810e-010.6753750.5619330.4336730.237741NaN0.534613
......................................................
(ZNF653, CDKN2A)0.261828-2.9623040.195867-0.0000370.9565031.2378421.2335440.0-1.6456330.0000091.319707e-040.0007670.2500470.0525910.038164NaN0.006131
(E2F7, SP1)0.2314730.3759560.3027680.0000280.8787211.1948011.1481870.00.1819810.5766015.622973e-050.0160790.3182380.0780940.070756NaN0.771183
(ZNF417, CDKN2A)0.183978-2.8130770.155436-0.0000370.6721920.8601860.8826940.0-1.7247030.0000584.457482e-020.0017220.4514910.2191160.170465NaN0.006266
(ZNF384, CDKN2A)0.165512-3.0179230.120197-0.0000360.9157991.1669941.1294100.0-1.8113910.0000191.957493e-010.0031560.3034100.0878050.075205NaN0.004527
(ZNF384, COL1A1)0.035308-2.3889960.016159-0.0000022.2669611.2270110.8390280.00.5283210.0299839.138044e-010.9148400.1155100.2639720.409823NaN0.601052
\n", + "

14147 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " R2 coef_intercept coef_TF coef_birth_days_to \\\n", + "(PARP1, BRCA2) 0.518944 -0.628582 1.009024 0.000016 \n", + "(AHR, CYP1B1) 0.460382 -0.372820 0.524032 -0.000007 \n", + "(AHR, FOS) 0.075604 1.447006 0.005936 -0.000021 \n", + "(AHR, SOS1) 0.469212 0.675038 0.764064 0.000008 \n", + "(AHR, UGT1A6) 0.071785 -1.422249 0.133097 -0.000007 \n", + "... ... ... ... ... \n", + "(ZNF653, CDKN2A) 0.261828 -2.962304 0.195867 -0.000037 \n", + "(E2F7, SP1) 0.231473 0.375956 0.302768 0.000028 \n", + "(ZNF417, CDKN2A) 0.183978 -2.813077 0.155436 -0.000037 \n", + "(ZNF384, CDKN2A) 0.165512 -3.017923 0.120197 -0.000036 \n", + "(ZNF384, COL1A1) 0.035308 -2.388996 0.016159 -0.000002 \n", + "\n", + " coef_race_ASIAN coef_race_BLACK OR AFRICAN AMERICAN \\\n", + "(PARP1, BRCA2) 1.641409 1.672660 \n", + "(AHR, CYP1B1) 0.767510 0.131428 \n", + "(AHR, FOS) -1.688957 -0.004656 \n", + "(AHR, SOS1) -0.986869 -0.397666 \n", + "(AHR, UGT1A6) -0.751396 0.789607 \n", + "... ... ... \n", + "(ZNF653, CDKN2A) 0.956503 1.237842 \n", + "(E2F7, SP1) 0.878721 1.194801 \n", + "(ZNF417, CDKN2A) 0.672192 0.860186 \n", + "(ZNF384, CDKN2A) 0.915799 1.166994 \n", + "(ZNF384, COL1A1) 2.266961 1.227011 \n", + "\n", + " coef_race_WHITE coef_race_[Not Evaluated] \\\n", + "(PARP1, BRCA2) 1.696522 0.0 \n", + "(AHR, CYP1B1) 0.002362 0.0 \n", + "(AHR, FOS) -0.260388 0.0 \n", + "(AHR, SOS1) -0.218486 0.0 \n", + "(AHR, UGT1A6) 1.100960 0.0 \n", + "... ... ... \n", + "(ZNF653, CDKN2A) 1.233544 0.0 \n", + "(E2F7, SP1) 1.148187 0.0 \n", + "(ZNF417, CDKN2A) 0.882694 0.0 \n", + "(ZNF384, CDKN2A) 1.129410 0.0 \n", + "(ZNF384, COL1A1) 0.839028 0.0 \n", + "\n", + " coef_gender_MALE pval_intercept pval_TF \\\n", + "(PARP1, BRCA2) -0.119685 0.387606 2.496552e-14 \n", + "(AHR, CYP1B1) -0.155764 0.421093 2.162187e-14 \n", + "(AHR, FOS) 0.997901 0.077100 9.545845e-01 \n", + "(AHR, SOS1) 1.340383 0.319415 2.434483e-14 \n", + "(AHR, UGT1A6) 0.563332 0.147520 2.895810e-01 \n", + "... ... ... ... \n", + "(ZNF653, CDKN2A) -1.645633 0.000009 1.319707e-04 \n", + "(E2F7, SP1) 0.181981 0.576601 5.622973e-05 \n", + "(ZNF417, CDKN2A) -1.724703 0.000058 4.457482e-02 \n", + "(ZNF384, CDKN2A) -1.811391 0.000019 1.957493e-01 \n", + "(ZNF384, COL1A1) 0.528321 0.029983 9.138044e-01 \n", + "\n", + " pval_birth_days_to pval_race_ASIAN \\\n", + "(PARP1, BRCA2) 0.196470 0.079281 \n", + "(AHR, CYP1B1) 0.355266 0.212239 \n", + "(AHR, FOS) 0.132889 0.118768 \n", + "(AHR, SOS1) 0.491512 0.272152 \n", + "(AHR, UGT1A6) 0.675375 0.561933 \n", + "... ... ... \n", + "(ZNF653, CDKN2A) 0.000767 0.250047 \n", + "(E2F7, SP1) 0.016079 0.318238 \n", + "(ZNF417, CDKN2A) 0.001722 0.451491 \n", + "(ZNF384, CDKN2A) 0.003156 0.303410 \n", + "(ZNF384, COL1A1) 0.914840 0.115510 \n", + "\n", + " pval_race_BLACK OR AFRICAN AMERICAN pval_race_WHITE \\\n", + "(PARP1, BRCA2) 0.019698 0.010310 \n", + "(AHR, CYP1B1) 0.782895 0.995718 \n", + "(AHR, FOS) 0.995564 0.736247 \n", + "(AHR, SOS1) 0.568683 0.734236 \n", + "(AHR, UGT1A6) 0.433673 0.237741 \n", + "... ... ... \n", + "(ZNF653, CDKN2A) 0.052591 0.038164 \n", + "(E2F7, SP1) 0.078094 0.070756 \n", + "(ZNF417, CDKN2A) 0.219116 0.170465 \n", + "(ZNF384, CDKN2A) 0.087805 0.075205 \n", + "(ZNF384, COL1A1) 0.263972 0.409823 \n", + "\n", + " pval_race_[Not Evaluated] pval_gender_MALE \n", + "(PARP1, BRCA2) NaN 0.852860 \n", + "(AHR, CYP1B1) NaN 0.716738 \n", + "(AHR, FOS) NaN 0.187290 \n", + "(AHR, SOS1) NaN 0.034528 \n", + "(AHR, UGT1A6) NaN 0.534613 \n", + "... ... ... \n", + "(ZNF653, CDKN2A) NaN 0.006131 \n", + "(E2F7, SP1) NaN 0.771183 \n", + "(ZNF417, CDKN2A) NaN 0.006266 \n", + "(ZNF384, CDKN2A) NaN 0.004527 \n", + "(ZNF384, COL1A1) NaN 0.601052 \n", + "\n", + "[14147 rows x 17 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.get_model_stats()" + ] } ], "metadata": { @@ -1441,7 +1862,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.11.3" } }, "nbformat": 4,