diff --git a/dataset_processing/.DS_Store b/dataset_processing/.DS_Store index 755630f..cb39dbd 100644 Binary files a/dataset_processing/.DS_Store and b/dataset_processing/.DS_Store differ diff --git a/dataset_processing/notebooks/ShifrutMarson2018.ipynb b/dataset_processing/notebooks/ShifrutMarson2018.ipynb index df1d568..db427e1 100644 --- a/dataset_processing/notebooks/ShifrutMarson2018.ipynb +++ b/dataset_processing/notebooks/ShifrutMarson2018.ipynb @@ -200,7 +200,8 @@ "source": [ "adata.obs['perturbation'] = adata.obs['target']\n", "# set all NaN values to \"control\"\n", - "adata.obs['perturbation'] = adata.obs['perturbation'].fillna('control')\n", + "# commenting out to leave blank\n", + "#adata.obs['perturbation'] = adata.obs['perturbation'].fillna('control')\n", "# set all NonTarget values to \"control\"\n", "adata.obs['perturbation'] = adata.obs['perturbation'].replace('NonTarget', 'control')" ] @@ -226,27 +227,27 @@ "data": { "text/plain": [ "perturbation\n", - "control 30683\n", - "DGKA 2296\n", - "PDCD1 1484\n", - "TMEM222 1426\n", - "BTLA 1412\n", - "HAVCR2 1355\n", - "CBLB 1327\n", - "CD5 1080\n", - "C10orf54 1058\n", - "MEF2D 1026\n", - "DGKZ 1020\n", - "LCP2 981\n", - "TCEB2 929\n", - "RASA2 905\n", - "CD3D 856\n", - "LAG3 840\n", - "SOCS1 835\n", - "TNFRSF9 777\n", - "CDKN1B 749\n", - "ARID1A 625\n", - "STAT6 572\n", + "control 3541\n", + "DGKA 2296\n", + "PDCD1 1484\n", + "TMEM222 1426\n", + "BTLA 1412\n", + "HAVCR2 1355\n", + "CBLB 1327\n", + "CD5 1080\n", + "C10orf54 1058\n", + "MEF2D 1026\n", + "DGKZ 1020\n", + "LCP2 981\n", + "TCEB2 929\n", + "RASA2 905\n", + "CD3D 856\n", + "LAG3 840\n", + "SOCS1 835\n", + "TNFRSF9 777\n", + "CDKN1B 749\n", + "ARID1A 625\n", + "STAT6 572\n", "Name: count, dtype: int64" ] }, @@ -271,7 +272,7 @@ "adata.obs['organism']=\"human\"\n", "adata.obs['perturbation_type']=\"CRISPR\"\n", "adata.obs['perturbation_type_2']= \"TCR stimulation\"\n", - "adata.obs['nperts']=1\n", + "adata.obs['nperts']=0\n", 
"adata.obs['celltype']=\"T cells\"" ] }, @@ -280,6 +281,57 @@ "execution_count": 15, "metadata": {}, "outputs": [], + "source": [ + "# if perturbation is not np.nan, add one to nperts\n", + "adata.obs.loc[adata.obs['perturbation'].notna(), 'nperts'] +=1" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# add a second perturbation when applicable\n", + "adata.obs.loc[adata.obs['perturbation_2']==\"stim\",'nperts'] +=1" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AAACCTGAGACACTAA 1\n", + "AAACCTGAGAGACTTA 1\n", + "AAACCTGAGCATCATC 1\n", + "AAACCTGAGGGCTTCC 1\n", + "AAACCTGAGGTTACCT 1\n", + " ..\n", + "TTTGTCAGTAGCGTGA 2\n", + "TTTGTCATCATCGCTC 2\n", + "TTTGTCATCCTCAACC 2\n", + "TTTGTCATCTCGCATC 2\n", + "TTTGTCATCTTGTCAT 2\n", + "Name: nperts, Length: 27142, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs[adata.obs['perturbation'].isna()]['nperts']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], "source": [ "adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'\n", "adata.var['ribo']= adata.var_names.str.startswith('RPS') | adata.var_names.str.startswith('RPL') # annotate the group of ribosomal genes as 'ribo'" @@ -287,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -296,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -311,7 +363,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -320,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, 
"outputs": [ { @@ -426,7 +478,7 @@ "[33694 rows x 2 columns]" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -437,367 +489,49 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
samplepatientguide_idguide_countstargetperturbationperturbation_2diseasecancertissue_typeorganismperturbation_typeperturbation_type_2npertscelltypencountsngenespercent_mitopercent_ribo
AAACCTGAGACACTAAD1_nostimD1NA0NaNcontrolcontrolhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells4367.017161.85482026.402565
AAACCTGAGAGACTTAD1_nostimD1NA0NaNcontrolcontrolhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells5846.019983.25008630.294218
AAACCTGAGCATCATCD1_nostimD1NA0NaNcontrolcontrolhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells3377.014382.93159628.042641
AAACCTGAGCGATTCTD1_nostimD1ES.sg26.PDCD15PDCD1PDCD1controlhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells5710.019932.41681333.047287
AAACCTGAGGGCTTCCD1_nostimD1NA0NaNcontrolcontrolhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells3077.012660.74748136.529087
............................................................
TTTGTCATCCTCAACCD2_stimD2NA0NaNcontrolstimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells7121.023332.89285225.530121
TTTGTCATCTCGCATCD2_stimD2NA0NaNcontrolstimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells4100.015351.97561034.439026
TTTGTCATCTTAGAGCD2_stimD2ES.sg34.TCEB22TCEB2TCEB2stimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells4293.015562.07314235.616119
TTTGTCATCTTATCTGD2_stimD2ES.sg35.TCEB26TCEB2TCEB2stimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells7735.025322.30122827.666452
TTTGTCATCTTGTCATD2_stimD2NA0NaNcontrolstimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells3981.016513.36598827.530771
\n", - "

52236 rows × 19 columns

\n", - "
" - ], "text/plain": [ - " sample patient guide_id guide_counts target \\\n", - "AAACCTGAGACACTAA D1_nostim D1 NA 0 NaN \n", - "AAACCTGAGAGACTTA D1_nostim D1 NA 0 NaN \n", - "AAACCTGAGCATCATC D1_nostim D1 NA 0 NaN \n", - "AAACCTGAGCGATTCT D1_nostim D1 ES.sg26.PDCD1 5 PDCD1 \n", - "AAACCTGAGGGCTTCC D1_nostim D1 NA 0 NaN \n", - "... ... ... ... ... ... \n", - "TTTGTCATCCTCAACC D2_stim D2 NA 0 NaN \n", - "TTTGTCATCTCGCATC D2_stim D2 NA 0 NaN \n", - "TTTGTCATCTTAGAGC D2_stim D2 ES.sg34.TCEB2 2 TCEB2 \n", - "TTTGTCATCTTATCTG D2_stim D2 ES.sg35.TCEB2 6 TCEB2 \n", - "TTTGTCATCTTGTCAT D2_stim D2 NA 0 NaN \n", - "\n", - " perturbation perturbation_2 disease cancer tissue_type \\\n", - "AAACCTGAGACACTAA control control healthy False primary \n", - "AAACCTGAGAGACTTA control control healthy False primary \n", - "AAACCTGAGCATCATC control control healthy False primary \n", - "AAACCTGAGCGATTCT PDCD1 control healthy False primary \n", - "AAACCTGAGGGCTTCC control control healthy False primary \n", - "... ... ... ... ... ... \n", - "TTTGTCATCCTCAACC control stim healthy False primary \n", - "TTTGTCATCTCGCATC control stim healthy False primary \n", - "TTTGTCATCTTAGAGC TCEB2 stim healthy False primary \n", - "TTTGTCATCTTATCTG TCEB2 stim healthy False primary \n", - "TTTGTCATCTTGTCAT control stim healthy False primary \n", - "\n", - " organism perturbation_type perturbation_type_2 nperts \\\n", - "AAACCTGAGACACTAA human CRISPR TCR stimulation 1 \n", - "AAACCTGAGAGACTTA human CRISPR TCR stimulation 1 \n", - "AAACCTGAGCATCATC human CRISPR TCR stimulation 1 \n", - "AAACCTGAGCGATTCT human CRISPR TCR stimulation 1 \n", - "AAACCTGAGGGCTTCC human CRISPR TCR stimulation 1 \n", - "... ... ... ... ... 
\n", - "TTTGTCATCCTCAACC human CRISPR TCR stimulation 1 \n", - "TTTGTCATCTCGCATC human CRISPR TCR stimulation 1 \n", - "TTTGTCATCTTAGAGC human CRISPR TCR stimulation 1 \n", - "TTTGTCATCTTATCTG human CRISPR TCR stimulation 1 \n", - "TTTGTCATCTTGTCAT human CRISPR TCR stimulation 1 \n", - "\n", - " celltype ncounts ngenes percent_mito percent_ribo \n", - "AAACCTGAGACACTAA T cells 4367.0 1716 1.854820 26.402565 \n", - "AAACCTGAGAGACTTA T cells 5846.0 1998 3.250086 30.294218 \n", - "AAACCTGAGCATCATC T cells 3377.0 1438 2.931596 28.042641 \n", - "AAACCTGAGCGATTCT T cells 5710.0 1993 2.416813 33.047287 \n", - "AAACCTGAGGGCTTCC T cells 3077.0 1266 0.747481 36.529087 \n", - "... ... ... ... ... ... \n", - "TTTGTCATCCTCAACC T cells 7121.0 2333 2.892852 25.530121 \n", - "TTTGTCATCTCGCATC T cells 4100.0 1535 1.975610 34.439026 \n", - "TTTGTCATCTTAGAGC T cells 4293.0 1556 2.073142 35.616119 \n", - "TTTGTCATCTTATCTG T cells 7735.0 2532 2.301228 27.666452 \n", - "TTTGTCATCTTGTCAT T cells 3981.0 1651 3.365988 27.530771 \n", - "\n", - "[52236 rows x 19 columns]" + "25094" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "adata.obs" + "adata.obs['perturbation'].value_counts().sum()" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AnnData object with n_obs × n_vars = 52236 × 33694\n", + " obs: 'sample', 'patient', 'guide_id', 'guide_counts', 'target', 'perturbation', 'perturbation_2', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'perturbation_type_2', 'nperts', 'celltype', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo'\n", + " var: 'ncounts', 'ncells'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata" + ] + }, + { + "cell_type": "code", + "execution_count": 25, "metadata": {}, "outputs": [], "source": [