From f8a6f21c5fd0d9ae53fce91aca2df335cfffdb6d Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Sun, 17 Apr 2022 14:50:03 +0100 Subject: [PATCH] Created using Colaboratory --- ...h Neo4j Clustering on Gutenberg book.ipynb | 1115 +++++++++++------ 1 file changed, 708 insertions(+), 407 deletions(-) diff --git a/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb b/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb index 940a20a..7f2ace5 100644 --- a/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb +++ b/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb @@ -1,415 +1,716 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# https://www.gutenberg.org/ebooks/95 Prisoner of Zelda\n", - "\n", - "# Fetch the data\n", - "target_url = 'https://www.gutenberg.org/files/95/95-0.txt'\n", - "import urllib.request\n", - "data = urllib.request.urlopen(target_url)\n", - "raw_data = data.read().decode('utf8').strip()\n", - "\n", - "# Preprocess text into chapters \n", - "import re\n", - "chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]\n", - "chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import into Neo4j" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# import spacy and load an NLP model\n", - "import spacy\n", - "nlp = spacy.load(\"en_core_web_lg\", disable=[\"tagger\", \"parser\"])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Import Neo4j and define cypher queries\n", - "import neo4j\n", - "host = \"bolt://localhost:7687\"\n", - "user = 'neo4j'\n", - "password = 'password'\n", - "\n", - "driver = neo4j.GraphDatabase.driver(host, auth=(user, password))\n", - "\n", - "save_query =\"\"\"\n", - "MERGE (p1:Person{name:$name1})\n", - "MERGE (p2:Person{name:$name2})\n", - "MERGE (p1)-[r:RELATED]-(p2)\n", - "ON CREATE SET r.score = 1\n", - "ON MATCH SET r.score = r.score + 1\"\"\"\n", - "\n", - "constraint_query=\"CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elphbergs Rudolf\n", - "Rudolf Rose\n", - "Robert Ancestry\n", - "Ancestry Rose\n", - "Robert Robert\n", - "Rudolf Rose\n", - "Rudolf Robert\n", - "Robert Rudolf\n", - "Ruritania Burlesdon\n", - "Lady Burlesdon George II\n", - "George II King\n", - "Rudolf the Third Ruritania\n", - "Burlesdon Amelia\n", - "James Burlesdon\n", - "Burlesdon a Knight of the Garter\n", - "a Knight of the Garter Rudolf\n", - "a Knight of the Garter Ruritania\n", - "Rudolf Ruritania\n", - "Jacob Jacob\n", - "Jacob Rudolf\n", - "Elphbergs Elphberg\n", - "Elphberg Rudolf\n", - "Bob Rose\n" - ] - } - ], - "source": [ - "# Run the analysis of the first chapter\n", - "c = chapters[0]\n", - "# Get involved\n", - "doc=nlp(c)\n", - "\n", - "with driver.session() as session:\n", - " #define constraint\n", - " session.run(constraint_query)\n", - " # Extract Person labels\n", - " involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n", 
- " # Preprocess text\n", - " decode = dict()\n", - " for i,x in enumerate(involved):\n", - " # Get mapping\n", - " decode['$${}$$'.format(i)] = x\n", - " # Preprocess text\n", - " c = c.replace(x,' $${}$$ '.format(i))\n", - " \n", - " # Split chapter into words\n", - " ws = c.split()\n", - " l = len(ws)\n", - " # Iterate through words\n", - " for wi,w in enumerate(ws):\n", - " # Skip if the word is not a person\n", - " if not w[:2] == '$$':\n", - " continue\n", - " # Check next x words for any involved person\n", - " x = 14\n", - " for i in range(wi+1,wi+x):\n", - " # Avoid list index error\n", - " if i >= l:\n", - " break\n", - " # Skip if the word is not a person\n", - " if not ws[i][:2] == '$$':\n", - " continue\n", - " # Store to Neo4j\n", - " params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}\n", - " session.run(save_query, params)\n", - " print(decode[ws[wi]],decode[ws[i]])\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Graph Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Run pagerank and louvain algorithm\n", - "pagerank =\"\"\"\n", - "CALL algo.pageRank('Person','RELATED',{direction:'BOTH'})\n", - "\"\"\"\n", - "louvain = \"\"\"\n", - "CALL algo.louvain('Person','RELATED',{direction:'BOTH'})\n", - "\"\"\"\n", - "with driver.session() as session:\n", - " session.run(pagerank)\n", - " session.run(louvain)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Vizualizations\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import IFrame, HTML\n", - "import json\n", - "import uuid\n", - "\n", - "\n", - "def generate_vis(host, user, password, cypher, labels_json, relationships_json):\n", - " html = \"\"\"\\\n", - "\n", - "\n", - " Neovis.js Simple Example\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " \"\"\"\n", - "\n", - " html = html.format(\n", - " host=host,\n", - " user=user,\n", - " password=password,\n", - " cypher=cypher,\n", - " labels = json.dumps(labels_json),\n", - " relationships=json.dumps(relationships_json)\n", - " )\n", - "\n", - " unique_id = str(uuid.uuid4())\n", - " filename = \"graph-{}.html\".format(unique_id)\n", - "\n", - " with open(filename, \"w\") as f:\n", - " f.write(html)\n", - " return IFrame(src=filename, width=500, height=500)\n", - "\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "scrolled": true - }, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, { - "data": { - "text/html": [ - "\n", - " \n", - " " + "cell_type": "markdown", + "source": [ + "* Updated to GDS 2.0 version\n", + "* Link to original blog post: https://towardsdatascience.com/network-analysis-of-prisoners-of-zenda-book-with-spacy-and-neo4j-b0839a640105" ], - "text/plain": [ - "" + "metadata": { + "id": "PLkTF151vvYa" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install neo4j spacy\n", + "!python -m spacy download en_core_web_lg\n" + ], + "metadata": { + "id": "xQuOPNtrv5tl", + "outputId": "85cb8c38-271c-482a-8f86-6a3e217b2193", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting neo4j\n", + " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", + "\u001b[?25l\r\u001b[K |███▋ | 10 kB 16.4 MB/s eta 0:00:01\r\u001b[K |███████▎ | 20 kB 11.0 MB/s eta 0:00:01\r\u001b[K |███████████ | 30 kB 8.9 MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 40 kB 8.3 MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 51 kB 4.3 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 61 kB 5.0 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 71 kB 5.4 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 81 kB 5.6 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 89 kB 3.9 MB/s \n", + "\u001b[?25hRequirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.64.0)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.1)\n", + "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.21.5)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5)\n", + "Requirement already 
satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.6)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.11.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.8.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (4.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n", + "Building wheels for collected packages: neo4j\n", + " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=375f8f269c3b4b2bd5e68f1c77965807b84099501344af227bf704360c6d44e3\n", + " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", + "Successfully built neo4j\n", + "Installing collected packages: neo4j\n", + "Successfully installed neo4j-4.4.2\n", + "Collecting en_core_web_lg==2.2.5\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)\n", + "\u001b[K |████████████████████████████████| 827.9 MB 1.1 MB/s \n", + "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from en_core_web_lg==2.2.5) (2.2.4)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.21.5)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (2.23.0)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.6)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (4.64.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (57.4.0)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (3.0.6)\n", + "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (7.4.0)\n", + "Requirement 
already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.1.3)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (0.4.1)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (2.0.6)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.5)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (0.9.1)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (4.11.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (3.8.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (4.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (2021.10.8)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (1.24.3)\n", + "Building wheels for collected packages: en-core-web-lg\n", + " Building wheel for en-core-web-lg (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=575aa8e998aefd0d0fd8abaeee8b7f4f7353dd86950646d465af448f2e44ffbf\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-mrwx1m4p/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5\n", + "Successfully built en-core-web-lg\n", + "Installing collected packages: en-core-web-lg\n", + "Successfully installed en-core-web-lg-2.2.5\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the model via spacy.load('en_core_web_lg')\n" + ] + } ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create vizualization\n", - "cypher = \"MATCH (p1:Person)-[r:RELATED]->(p2:Person) RETURN *\"\n", - "\n", - "labels_json = {\n", - " \"Person\": {\n", - " \"caption\": \"name\",\n", - " \"size\": \"pagerank\",\n", - " \"community\": \"community\"\n", - " }\n", - "}\n", - "\n", - "relationships_json = {\n", - " \"RELATED\": {\n", - " \"thickness\": \"score\",\n", - " \"caption\": False\n", - " }\n", - "}\n", - "\n", - "generate_vis(host, user, password, cypher, labels_json, relationships_json)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GOVTfyZ2vtOt" + }, + "source": [ + "Restart runtime before continuing in order for SpaCy to work\n", + "\n", + "# Data preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "IzJm4AshvtOx" + }, + "outputs": [], + "source": [ + "# https://www.gutenberg.org/ebooks/95 Prisoner of Zelda\n", + "\n", + "# Fetch the data\n", + "target_url = 'https://www.gutenberg.org/files/95/95-0.txt'\n", + "import urllib.request\n", + "data = urllib.request.urlopen(target_url)\n", + "raw_data = data.read().decode('utf8').strip()\n", + "\n", + "# Preprocess text into chapters \n", + "import re\n", + "chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]\n", + "chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "182mC1YtvtOz" + }, + "source": [ + "# Import into Neo4j" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "MPOKdT96vtOz" + }, + "outputs": [], + "source": [ + "# import spacy and load an NLP model\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_lg\", disable=[\"tagger\", \"parser\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "OxeIYL9kvtOz" + }, + "outputs": [], + "source": [ + "# Import Neo4j and define cypher queries\n", + "import neo4j\n", + "host = 'bolt://3.235.2.228:7687'\n", + "user = 'neo4j'\n", + "password = 'seats-drunks-carbon'\n", + "\n", + "driver = neo4j.GraphDatabase.driver(host, auth=(user, password))\n", + "\n", + "save_query =\"\"\"\n", + "MERGE (p1:Person{name:$name1})\n", + "MERGE (p2:Person{name:$name2})\n", + "MERGE (p1)-[r:RELATED]-(p2)\n", + "ON CREATE SET r.score = 1\n", + "ON MATCH SET r.score = r.score + 1\"\"\"\n", + "\n", + "constraint_query=\"CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "wzglMHIIvtO0", + "outputId": "cb69fad5-405c-4795-cd45-d4b9c1d7c250", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + 
"text": [ + "Rassendylls Elphberg\n", + "Rudolf Rose \n", + "Rassendylls Robert\n", + "Robert Robert\n", + "Rudolf Rose \n", + "Robert Good heavens \n", + "Good heavens Rudolf\n", + "Rudolf Robert\n", + "Robert Rudolf\n", + "Elphberg Rassendylls\n", + "Burlesdon Strelsau\n", + "Burlesdon Amelia\n", + "James Burlesdon\n", + "James Rassendyll\n", + "Burlesdon Rassendyll\n", + "Burlesdon a Knight of the Garter\n", + "Rassendyll a Knight of the Garter\n", + "Rassendyll Rudolf\n", + "a Knight of the Garter Rudolf\n", + "Rose Nonsense \n", + "Jacob Jacob\n", + "Jacob Rudolf\n", + "Elphberg Elphberg\n", + "Elphberg Rudolf\n", + "Rudolf Strelsau\n", + "Bob Rose\n" + ] + } + ], + "source": [ + "# Run the analysis of the first chapter\n", + "c = chapters[0]\n", + "# Get involved\n", + "doc=nlp(c)\n", + "\n", + "with driver.session() as session:\n", + " #define constraint\n", + " session.run(constraint_query)\n", + " # Extract Person labels\n", + " involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n", + " # Preprocess text\n", + " decode = dict()\n", + " for i,x in enumerate(involved):\n", + " # Get mapping\n", + " decode['$${}$$'.format(i)] = x\n", + " # Preprocess text\n", + " c = c.replace(x,' $${}$$ '.format(i))\n", + " \n", + " # Split chapter into words\n", + " ws = c.split()\n", + " l = len(ws)\n", + " # Iterate through words\n", + " for wi,w in enumerate(ws):\n", + " # Skip if the word is not a person\n", + " if not w[:2] == '$$':\n", + " continue\n", + " # Check next x words for any involved person\n", + " x = 14\n", + " for i in range(wi+1,wi+x):\n", + " # Avoid list index error\n", + " if i >= l:\n", + " break\n", + " # Skip if the word is not a person\n", + " if not ws[i][:2] == '$$':\n", + " continue\n", + " # Store to Neo4j\n", + " params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}\n", + " session.run(save_query, params)\n", + " print(decode[ws[wi]],decode[ws[i]])\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QJQ6rAU-vtO2" + }, + "source": [ + "# Graph Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "dKoppRH-vtO2" + }, + "outputs": [], + "source": [ + "# Project the graph\n", + "graph_projection = \"\"\"\n", + "CALL gds.graph.project('ch1', 'Person', {RELATED:{orientation:'UNDIRECTED'}})\n", + "\"\"\"\n", + "\n", + "# Run pagerank and louvain algorithm\n", + "pagerank =\"\"\"\n", + "CALL gds.pageRank.write('ch1',{writeProperty:'pagerank'})\n", + "\"\"\"\n", + "louvain = \"\"\"\n", + "CALL gds.louvain.write('ch1',{writeProperty:'louvain'})\n", + "\"\"\"\n", + "\n", + "drop_graph = \"\"\"\n", + "CALL gds.graph.drop('ch1')\n", + "\"\"\"\n", + "\n", + "with driver.session() as session:\n", + " session.run(graph_projection)\n", + " session.run(pagerank)\n", + " session.run(louvain)\n", + " session.run(drop_graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mIU8L_lYvtO3" + }, + "source": [ + "# Results" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Strelsau Burlesdon\n", - "House Elphberg\n" - ] + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "tIBn7tj2vtO7" + }, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "\n", + "def read_query(query, params={}):\n", + " with driver.session() as session:\n", + " result = session.run(query, params)\n", + " return pd.DataFrame([r.values() for r in result], columns=result.keys())" + ] + }, + { + "cell_type": "code", + "source": [ + "# Evaluate 
pagerank\n", + "read_query(\"\"\"\n", + "MATCH (c:Person)\n", + "RETURN c.name AS character, c.pagerank AS score\n", + "ORDER BY score DESC LIMIT 5\n", + "\"\"\")" + ], + "metadata": { + "id": "UqPWBHBkx-J-", + "outputId": "4c9db6b6-4748-4b2d-8c69-8f0155ba0938", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " character score\n", + "0 Rudolf 2.234279\n", + "1 Burlesdon 1.550467\n", + "2 Robert 1.366045\n", + "3 Rassendyll 1.177921\n", + "4 Elphberg 1.115947" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
characterscore
0Rudolf2.234279
1Burlesdon1.550467
2Robert1.366045
3Rassendyll1.177921
4Elphberg1.115947
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Evaluate louvain\n", + "read_query(\"\"\"\n", + "MATCH (c:Person)\n", + "RETURN c.louvain AS community, collect(c.name) AS members\n", + "ORDER BY size(members) DESC\n", + "\"\"\")" + ], + "metadata": { + "id": "9Gm32eoLyE3Z", + "outputId": "7b97b170-90bc-4cbd-9076-f91acd3b7edf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + } + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " community members\n", + "0 3 [Rudolf, Rose , Strelsau, Nonsense , Jacob]\n", + "1 10 [Burlesdon, Amelia, James , Rassendyll, a Kni...\n", + "2 5 [Rassendylls, Elphberg, Robert, Good heavens ]\n", + "3 15 [Bob, Rose]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communitymembers
03[Rudolf, Rose , Strelsau, Nonsense , Jacob]
110[Burlesdon, Amelia, James , Rassendyll, a Kni...
25[Rassendylls, Elphberg, Robert, Good heavens ]
315[Bob, Rose]
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "OPJe0M-YyV5c" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + }, + "colab": { + "name": "Spacy NER with Neo4j Clustering on Gutenberg book.ipynb", + "provenance": [], + "include_colab_link": true } - ], - "source": [ - "# Additional options\n", - "# Add orgs\n", - "c = chapters[0]\n", - "doc = nlp(c)\n", - "\n", - "save_org_query = \"\"\"\n", - "\n", - "MERGE (p:Person{name:$person})\n", - "MERGE (o:Organization{name:$org})\n", - "MERGE (p)-[r:PART_OF]->(o)\n", - "ON CREATE SET r.score = 1\n", - "ON MATCH SET r.score = r.score + 1\n", - "\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " # Define the mapping\n", - " persons = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n", - " orgs = list(set([ent.text for ent in doc.ents if ent.label_=='ORG']))\n", - " decode_org = dict()\n", - " decode_person = dict()\n", - " # Replace person\n", - " for i,p in enumerate(persons):\n", - " decode_person['$${}$$'.format(i)] = p\n", - " r = ' $${}$$ '.format(i)\n", - " c = c.replace(p,r)\n", - " # Replace organizations\n", - " for i,o in enumerate(orgs):\n", - " decode_org['&&{}&&'.format(i)] = o\n", - " c = c.replace(o,' &&{}&& '.format(i)) \n", - " # Split chapter into words\n", - " ws = c.split()\n", - " l = len(ws)\n", - " for wi,w in enumerate(ws):\n", - " # Skip if the word is not a organization\n", - " if not w[:2] == '&&':\n", - " continue\n", - " # Check previous and next x words for any involved person\n", - " x = 5\n", - " for i in range(wi-x,wi+x):\n", - " # Avoid list index error\n", - " if i >= l:\n", - " break\n", - " # Skip if the word is not a person\n", - " if (ws[i][:2]!='$$') or (i==wi):\n", - " continue\n", - " # Store to Neo4j\n", - " # Todo: Maybe some automated mapping of name to surnames etc..\n", - " params = {'org':decode_org[ws[wi]],'person':decode_person[ws[i]]}\n", - " session.run(save_org_query, params)\n", - " print(decode_org[ws[wi]],decode_person[ws[i]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file