Added first query to scenario 4 (HAMLET data)

ejp-rd-vp · Jul 12, 2024 · eb64777 · eb64777
1 parent c4838f3
commit eb64777
Show file tree

Hide file tree

Showing 6 changed files with 186 additions and 37 deletions.
diff --git a/SPARQL/scenario_4/question1_a.rq b/SPARQL/scenario_4/question1_a.rq
@@ -0,0 +1,31 @@
+# Lists each gene (identifier and symbol) linked to the selected phenopacket once; DISTINCT collapses genes reached via several variants.
+PREFIX obo: <http://purl.obolibrary.org/obo/>
+PREFIX sio: <http://semanticscience.org/resource/>
+PREFIX dcterms: <http://purl.org/dc/terms/>
+
+SELECT DISTINCT ?geneid ?genesymbolvalue
+WHERE {
+    # Select the phenopacket whose ID node (?id) carries the literal value given below
+    ?phenopacket a obo:NCIT_C79269 ;
+                 sio:SIO_000228 ?role .
+    ?id sio:SIO_000020 ?role ;
+        sio:SIO_000300 "new-reference-files" .
+
+    # Walk from that phenopacket to all of its genomic interpretations
+    ?phenopacket sio:SIO_001403 ?interpr .
+    ?interpr a obo:NCIT_C41255 ;
+             sio:SIO_001403 ?diagnosis .
+    ?diagnosis sio:SIO_001403 ?genomicinterpr .
+    ?genomicinterpr a obo:SO_0001026 .
+
+    # Walk from each genomic interpretation to the gene it refers to and read the gene's ID and symbol
+    ?genomicinterpr sio:SIO_001403 ?varinterpr .
+    ?varinterpr a obo:SO_0001060 ;
+                sio:SIO_001403 ?vardescr .
+    ?vardescr a obo:NCIT_C97927 ;
+              sio:SIO_001403 ?genedescr .
+    ?genedescr a obo:NCIT_C16612 ;
+               dcterms:identifier ?geneid ;
+               sio:SIO_000008 ?genesymbol .
+    ?genesymbol sio:SIO_000300 ?genesymbolvalue .
+}
diff --git a/_toc.yml b/_toc.yml
@@ -7,6 +7,6 @@ chapters:
   - file: scenario1
   - file: scenario2
   - file: scenario3
-  - file: hamlet
+  - file: scenario4
 
 
diff --git a/hamlet.md b/hamlet.md
diff --git a/hamlet_notebook.ipynb b/hamlet_notebook.ipynb
@@ -2,12 +2,13 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
-    "import rdflib"
+    "import rdflib\n",
+    "import pandas as pd"
    ]
   },
   {
@@ -74,57 +75,153 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
-    "query = \"\"\"\n",
-    "    PREFIX ex: <https://example.org/>\n",
-    "    PREFIX obo: <http://purl.obolibrary.org/obo/>\n",
-    "    PREFIX sio: <http://semanticscience.org/resource/>\n",
+    "SCENARIO_NR = 4\n",
     "\n",
-    "    SELECT ?idvalue\n",
-    "    WHERE {\n",
-    "        ?phenopacket a obo:NCIT_C79269 .\n",
-    "        ?role a obo:NCIT_C48835 .\n",
-    "        ?phenopacket sio:SIO_000228 ?role .\n",
-    "        ?id a obo:IAO_0020000 .\n",
-    "        ?id sio:SIO_000020 ?role .\n",
-    "        ?id sio:SIO_000300 ?idvalue .\n",
-    "    }\n",
-    "\"\"\""
+    "with open(f'SPARQL/scenario_{SCENARIO_NR}/question1_a.rq', 'r') as file:\n",
+    "    query = file.read()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [],
    "source": [
-    "results = g.query(query)\n",
-    "\n",
-    "id_values = []\n",
+    "results = g.query(query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "genes = []\n",
     "\n",
     "for result_row in results:\n",
-    "    phenopacket_id = str(result_row[0])\n",
-    "    id_values.append(phenopacket_id)"
+    "    genes.append({'id': result_row[0], 'symbol': result_row[1]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>symbol</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ENSG00000213281</td>\n",
+       "      <td>NRAS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ENSG00000141510</td>\n",
+       "      <td>TP53</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                id symbol\n",
+       "0  ENSG00000213281   NRAS\n",
+       "1  ENSG00000141510   TP53"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "genes_df = pd.DataFrame(genes)\n",
+    "genes_df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>symbol</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ENSG00000213281</td>\n",
+       "      <td>NRAS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ENSG00000141510</td>\n",
+       "      <td>TP53</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "'new-reference-files'"
+       "                id symbol\n",
+       "0  ENSG00000213281   NRAS\n",
+       "1  ENSG00000141510   TP53"
       ]
      },
      "metadata": {
       "scrapbook": {
        "mime_prefix": "",
-       "name": "phenopacket_id"
+       "name": "scenario4_genes"
       }
      },
      "output_type": "display_data"
@@ -133,7 +230,7 @@
    "source": [
     "from myst_nb import glue\n",
     "\n",
-    "glue('phenopacket_id', id_values[0])"
+    "glue('scenario4_genes', genes_df)"
    ]
   }
  ],

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,5 @@
 jupyter-book
 matplotlib
 numpy
+pandas  # imported by hamlet_notebook.ipynb to tabulate the query results
+rdflib
diff --git a/scenario4.md b/scenario4.md
@@ -0,0 +1,29 @@
+---
+title: Scenario 4 - Genetic Analysis on AML
+---
+
+# Scenario 4: Genetic Analysis on AML
+
+## Cell Line Derived from a Patient with AML
+
+A researcher is interested in the diversity and prevalence of genetic aberrations that cause acute myeloid leukemia (AML) in patients. A cell line has been derived from a patient with AML. To acquire data for genetic analysis of this cell line, whole-transcriptome RNA sequencing was performed, and the results were used as input to the HAMLET pipeline. The HAMLET pipeline enables the simultaneous detection of genetic mutations, resulting in a phenopacket dataset that stores both genetic and phenotypic information.
+
+---
+
+## Research Questions
+
+### Question 1: Gene Mutation Prevalence
+
+Have patients been observed with variants in the same gene and the same symptoms?
+
+To answer this question, we first find the genes that carry at least one mutation according to the phenopacket of the AML cell line:
+
+```{literalinclude} SPARQL/scenario_4/question1_a.rq
+:language: sparql
+```
+
+```{glue} scenario4_genes
+:doc: hamlet_notebook.ipynb
+```
+
+---