diff --git a/dataset_processing/.DS_Store b/dataset_processing/.DS_Store index 755630f..cb39dbd 100644 Binary files a/dataset_processing/.DS_Store and b/dataset_processing/.DS_Store differ diff --git a/dataset_processing/notebooks/ShifrutMarson2018.ipynb b/dataset_processing/notebooks/ShifrutMarson2018.ipynb index df1d568..db427e1 100644 --- a/dataset_processing/notebooks/ShifrutMarson2018.ipynb +++ b/dataset_processing/notebooks/ShifrutMarson2018.ipynb @@ -200,7 +200,8 @@ "source": [ "adata.obs['perturbation'] = adata.obs['target']\n", "# set all NaN values to \"control\"\n", - "adata.obs['perturbation'] = adata.obs['perturbation'].fillna('control')\n", + "# commenting out to leave blank\n", + "#adata.obs['perturbation'] = adata.obs['perturbation'].fillna('control')\n", "# set all NonTarget values to \"control\"\n", "adata.obs['perturbation'] = adata.obs['perturbation'].replace('NonTarget', 'control')" ] @@ -226,27 +227,27 @@ "data": { "text/plain": [ "perturbation\n", - "control 30683\n", - "DGKA 2296\n", - "PDCD1 1484\n", - "TMEM222 1426\n", - "BTLA 1412\n", - "HAVCR2 1355\n", - "CBLB 1327\n", - "CD5 1080\n", - "C10orf54 1058\n", - "MEF2D 1026\n", - "DGKZ 1020\n", - "LCP2 981\n", - "TCEB2 929\n", - "RASA2 905\n", - "CD3D 856\n", - "LAG3 840\n", - "SOCS1 835\n", - "TNFRSF9 777\n", - "CDKN1B 749\n", - "ARID1A 625\n", - "STAT6 572\n", + "control 3541\n", + "DGKA 2296\n", + "PDCD1 1484\n", + "TMEM222 1426\n", + "BTLA 1412\n", + "HAVCR2 1355\n", + "CBLB 1327\n", + "CD5 1080\n", + "C10orf54 1058\n", + "MEF2D 1026\n", + "DGKZ 1020\n", + "LCP2 981\n", + "TCEB2 929\n", + "RASA2 905\n", + "CD3D 856\n", + "LAG3 840\n", + "SOCS1 835\n", + "TNFRSF9 777\n", + "CDKN1B 749\n", + "ARID1A 625\n", + "STAT6 572\n", "Name: count, dtype: int64" ] }, @@ -271,7 +272,7 @@ "adata.obs['organism']=\"human\"\n", "adata.obs['perturbation_type']=\"CRISPR\"\n", "adata.obs['perturbation_type_2']= \"TCR stimulation\"\n", - "adata.obs['nperts']=1\n", + "adata.obs['nperts']=0\n", "adata.obs['celltype']=\"T cells\"" ] }, @@ -280,6 +281,57 @@ "execution_count": 15, "metadata": {}, "outputs": [], + "source": [ + "# if perturbation is not np.nan, add one to nperts\n", + "adata.obs.loc[adata.obs['perturbation'].isna(), 'nperts'] +=1" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# add a second perturbation when applicable\n", + "adata.obs.loc[adata.obs['perturbation_2']==\"stim\",'nperts'] +=1" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AAACCTGAGACACTAA 1\n", + "AAACCTGAGAGACTTA 1\n", + "AAACCTGAGCATCATC 1\n", + "AAACCTGAGGGCTTCC 1\n", + "AAACCTGAGGTTACCT 1\n", + " ..\n", + "TTTGTCAGTAGCGTGA 2\n", + "TTTGTCATCATCGCTC 2\n", + "TTTGTCATCCTCAACC 2\n", + "TTTGTCATCTCGCATC 2\n", + "TTTGTCATCTTGTCAT 2\n", + "Name: nperts, Length: 27142, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs[adata.obs['perturbation'].isna()]['nperts']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], "source": [ "adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'\n", "adata.var['ribo']= adata.var_names.str.startswith('RPS') | adata.var_names.str.startswith('RPL') # annotate the group of ribosomal genes as 'ribo'" @@ -287,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -296,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -311,7 +363,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -320,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -426,7 +478,7 @@ "[33694 rows x 2 columns]" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -437,367 +489,49 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - " | sample | \n", - "patient | \n", - "guide_id | \n", - "guide_counts | \n", - "target | \n", - "perturbation | \n", - "perturbation_2 | \n", - "disease | \n", - "cancer | \n", - "tissue_type | \n", - "organism | \n", - "perturbation_type | \n", - "perturbation_type_2 | \n", - "nperts | \n", - "celltype | \n", - "ncounts | \n", - "ngenes | \n", - "percent_mito | \n", - "percent_ribo | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AAACCTGAGACACTAA | \n", - "D1_nostim | \n", - "D1 | \n", - "NA | \n", - "0 | \n", - "NaN | \n", - "control | \n", - "control | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "4367.0 | \n", - "1716 | \n", - "1.854820 | \n", - "26.402565 | \n", - "
AAACCTGAGAGACTTA | \n", - "D1_nostim | \n", - "D1 | \n", - "NA | \n", - "0 | \n", - "NaN | \n", - "control | \n", - "control | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "5846.0 | \n", - "1998 | \n", - "3.250086 | \n", - "30.294218 | \n", - "
AAACCTGAGCATCATC | \n", - "D1_nostim | \n", - "D1 | \n", - "NA | \n", - "0 | \n", - "NaN | \n", - "control | \n", - "control | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "3377.0 | \n", - "1438 | \n", - "2.931596 | \n", - "28.042641 | \n", - "
AAACCTGAGCGATTCT | \n", - "D1_nostim | \n", - "D1 | \n", - "ES.sg26.PDCD1 | \n", - "5 | \n", - "PDCD1 | \n", - "PDCD1 | \n", - "control | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "5710.0 | \n", - "1993 | \n", - "2.416813 | \n", - "33.047287 | \n", - "
AAACCTGAGGGCTTCC | \n", - "D1_nostim | \n", - "D1 | \n", - "NA | \n", - "0 | \n", - "NaN | \n", - "control | \n", - "control | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "3077.0 | \n", - "1266 | \n", - "0.747481 | \n", - "36.529087 | \n", - "
... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "... | \n", - "
TTTGTCATCCTCAACC | \n", - "D2_stim | \n", - "D2 | \n", - "NA | \n", - "0 | \n", - "NaN | \n", - "control | \n", - "stim | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "7121.0 | \n", - "2333 | \n", - "2.892852 | \n", - "25.530121 | \n", - "
TTTGTCATCTCGCATC | \n", - "D2_stim | \n", - "D2 | \n", - "NA | \n", - "0 | \n", - "NaN | \n", - "control | \n", - "stim | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "4100.0 | \n", - "1535 | \n", - "1.975610 | \n", - "34.439026 | \n", - "
TTTGTCATCTTAGAGC | \n", - "D2_stim | \n", - "D2 | \n", - "ES.sg34.TCEB2 | \n", - "2 | \n", - "TCEB2 | \n", - "TCEB2 | \n", - "stim | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "4293.0 | \n", - "1556 | \n", - "2.073142 | \n", - "35.616119 | \n", - "
TTTGTCATCTTATCTG | \n", - "D2_stim | \n", - "D2 | \n", - "ES.sg35.TCEB2 | \n", - "6 | \n", - "TCEB2 | \n", - "TCEB2 | \n", - "stim | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "7735.0 | \n", - "2532 | \n", - "2.301228 | \n", - "27.666452 | \n", - "
TTTGTCATCTTGTCAT | \n", - "D2_stim | \n", - "D2 | \n", - "NA | \n", - "0 | \n", - "NaN | \n", - "control | \n", - "stim | \n", - "healthy | \n", - "False | \n", - "primary | \n", - "human | \n", - "CRISPR | \n", - "TCR stimulation | \n", - "1 | \n", - "T cells | \n", - "3981.0 | \n", - "1651 | \n", - "3.365988 | \n", - "27.530771 | \n", - "
52236 rows × 19 columns
\n", - "