diff --git a/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb b/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb
index 940a20a..7f2ace5 100644
--- a/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb
+++ b/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb
@@ -1,415 +1,716 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Data preprocessing"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# https://www.gutenberg.org/ebooks/95 Prisoner of Zelda\n",
- "\n",
- "# Fetch the data\n",
- "target_url = 'https://www.gutenberg.org/files/95/95-0.txt'\n",
- "import urllib.request\n",
- "data = urllib.request.urlopen(target_url)\n",
- "raw_data = data.read().decode('utf8').strip()\n",
- "\n",
- "# Preprocess text into chapters \n",
- "import re\n",
- "chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]\n",
- "chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Import into Neo4j"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "# import spacy and load an NLP model\n",
- "import spacy\n",
- "nlp = spacy.load(\"en_core_web_lg\", disable=[\"tagger\", \"parser\"])\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Import Neo4j and define cypher queries\n",
- "import neo4j\n",
- "host = \"bolt://localhost:7687\"\n",
- "user = 'neo4j'\n",
- "password = 'password'\n",
- "\n",
- "driver = neo4j.GraphDatabase.driver(host, auth=(user, password))\n",
- "\n",
- "save_query =\"\"\"\n",
- "MERGE (p1:Person{name:$name1})\n",
- "MERGE (p2:Person{name:$name2})\n",
- "MERGE (p1)-[r:RELATED]-(p2)\n",
- "ON CREATE SET r.score = 1\n",
- "ON MATCH SET r.score = r.score + 1\"\"\"\n",
- "\n",
- "constraint_query=\"CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
+ "cells": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Elphbergs Rudolf\n",
- "Rudolf Rose\n",
- "Robert Ancestry\n",
- "Ancestry Rose\n",
- "Robert Robert\n",
- "Rudolf Rose\n",
- "Rudolf Robert\n",
- "Robert Rudolf\n",
- "Ruritania Burlesdon\n",
- "Lady Burlesdon George II\n",
- "George II King\n",
- "Rudolf the Third Ruritania\n",
- "Burlesdon Amelia\n",
- "James Burlesdon\n",
- "Burlesdon a Knight of the Garter\n",
- "a Knight of the Garter Rudolf\n",
- "a Knight of the Garter Ruritania\n",
- "Rudolf Ruritania\n",
- "Jacob Jacob\n",
- "Jacob Rudolf\n",
- "Elphbergs Elphberg\n",
- "Elphberg Rudolf\n",
- "Bob Rose\n"
- ]
- }
- ],
- "source": [
- "# Run the analysis of the first chapter\n",
- "c = chapters[0]\n",
- "# Get involved\n",
- "doc=nlp(c)\n",
- "\n",
- "with driver.session() as session:\n",
- " #define constraint\n",
- " session.run(constraint_query)\n",
- " # Extract Person labels\n",
- " involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n",
- " # Preprocess text\n",
- " decode = dict()\n",
- " for i,x in enumerate(involved):\n",
- " # Get mapping\n",
- " decode['$${}$$'.format(i)] = x\n",
- " # Preprocess text\n",
- " c = c.replace(x,' $${}$$ '.format(i))\n",
- " \n",
- " # Split chapter into words\n",
- " ws = c.split()\n",
- " l = len(ws)\n",
- " # Iterate through words\n",
- " for wi,w in enumerate(ws):\n",
- " # Skip if the word is not a person\n",
- " if not w[:2] == '$$':\n",
- " continue\n",
- " # Check next x words for any involved person\n",
- " x = 14\n",
- " for i in range(wi+1,wi+x):\n",
- " # Avoid list index error\n",
- " if i >= l:\n",
- " break\n",
- " # Skip if the word is not a person\n",
- " if not ws[i][:2] == '$$':\n",
- " continue\n",
- " # Store to Neo4j\n",
- " params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}\n",
- " session.run(save_query, params)\n",
- " print(decode[ws[wi]],decode[ws[i]])\n",
- " "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Graph Analysis"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Run pagerank and louvain algorithm\n",
- "pagerank =\"\"\"\n",
- "CALL algo.pageRank('Person','RELATED',{direction:'BOTH'})\n",
- "\"\"\"\n",
- "louvain = \"\"\"\n",
- "CALL algo.louvain('Person','RELATED',{direction:'BOTH'})\n",
- "\"\"\"\n",
- "with driver.session() as session:\n",
- " session.run(pagerank)\n",
- " session.run(louvain)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Vizualizations\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "from IPython.display import IFrame, HTML\n",
- "import json\n",
- "import uuid\n",
- "\n",
- "\n",
- "def generate_vis(host, user, password, cypher, labels_json, relationships_json):\n",
- " html = \"\"\"\\\n",
- "\n",
- "
\n",
- " Neovis.js Simple Example\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \"\"\"\n",
- "\n",
- " html = html.format(\n",
- " host=host,\n",
- " user=user,\n",
- " password=password,\n",
- " cypher=cypher,\n",
- " labels = json.dumps(labels_json),\n",
- " relationships=json.dumps(relationships_json)\n",
- " )\n",
- "\n",
- " unique_id = str(uuid.uuid4())\n",
- " filename = \"graph-{}.html\".format(unique_id)\n",
- "\n",
- " with open(filename, \"w\") as f:\n",
- " f.write(html)\n",
- " return IFrame(src=filename, width=500, height=500)\n",
- "\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
{
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
+ "cell_type": "markdown",
+ "source": [
+ "* Updated to GDS 2.0 version\n",
+ "* Link to original blog post: https://towardsdatascience.com/network-analysis-of-prisoners-of-zenda-book-with-spacy-and-neo4j-b0839a640105"
],
- "text/plain": [
- ""
+ "metadata": {
+ "id": "PLkTF151vvYa"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install neo4j spacy\n",
+ "!python -m spacy download en_core_web_lg\n"
+ ],
+ "metadata": {
+ "id": "xQuOPNtrv5tl",
+ "outputId": "85cb8c38-271c-482a-8f86-6a3e217b2193",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting neo4j\n",
+ " Downloading neo4j-4.4.2.tar.gz (89 kB)\n",
+ "\u001b[?25l\r\u001b[K |███▋ | 10 kB 16.4 MB/s eta 0:00:01\r\u001b[K |███████▎ | 20 kB 11.0 MB/s eta 0:00:01\r\u001b[K |███████████ | 30 kB 8.9 MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 40 kB 8.3 MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 51 kB 4.3 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 61 kB 5.0 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 71 kB 5.4 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 81 kB 5.6 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 89 kB 3.9 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4)\n",
+ "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n",
+ "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0)\n",
+ "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0)\n",
+ "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3)\n",
+ "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.64.0)\n",
+ "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.1)\n",
+ "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0)\n",
+ "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.21.5)\n",
+ "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6)\n",
+ "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1)\n",
+ "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5)\n",
+ "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6)\n",
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0)\n",
+ "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.6)\n",
+ "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.11.3)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.8.0)\n",
+ "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (4.1.1)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8)\n",
+ "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n",
+ "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n",
+ "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n",
+ "Building wheels for collected packages: neo4j\n",
+ " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=375f8f269c3b4b2bd5e68f1c77965807b84099501344af227bf704360c6d44e3\n",
+ " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n",
+ "Successfully built neo4j\n",
+ "Installing collected packages: neo4j\n",
+ "Successfully installed neo4j-4.4.2\n",
+ "Collecting en_core_web_lg==2.2.5\n",
+ " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)\n",
+ "\u001b[K |████████████████████████████████| 827.9 MB 1.1 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from en_core_web_lg==2.2.5) (2.2.4)\n",
+ "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.21.5)\n",
+ "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (2.23.0)\n",
+ "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.6)\n",
+ "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.0)\n",
+ "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (4.64.0)\n",
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (57.4.0)\n",
+ "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (3.0.6)\n",
+ "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (7.4.0)\n",
+ "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.1.3)\n",
+ "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (0.4.1)\n",
+ "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (2.0.6)\n",
+ "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.5)\n",
+ "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (0.9.1)\n",
+ "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (4.11.3)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (3.8.0)\n",
+ "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (4.1.1)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (2021.10.8)\n",
+ "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (3.0.4)\n",
+ "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (2.10)\n",
+ "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (1.24.3)\n",
+ "Building wheels for collected packages: en-core-web-lg\n",
+ " Building wheel for en-core-web-lg (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=575aa8e998aefd0d0fd8abaeee8b7f4f7353dd86950646d465af448f2e44ffbf\n",
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-mrwx1m4p/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5\n",
+ "Successfully built en-core-web-lg\n",
+ "Installing collected packages: en-core-web-lg\n",
+ "Successfully installed en-core-web-lg-2.2.5\n",
+ "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
+ "You can now load the model via spacy.load('en_core_web_lg')\n"
+ ]
+ }
]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Create vizualization\n",
- "cypher = \"MATCH (p1:Person)-[r:RELATED]->(p2:Person) RETURN *\"\n",
- "\n",
- "labels_json = {\n",
- " \"Person\": {\n",
- " \"caption\": \"name\",\n",
- " \"size\": \"pagerank\",\n",
- " \"community\": \"community\"\n",
- " }\n",
- "}\n",
- "\n",
- "relationships_json = {\n",
- " \"RELATED\": {\n",
- " \"thickness\": \"score\",\n",
- " \"caption\": False\n",
- " }\n",
- "}\n",
- "\n",
- "generate_vis(host, user, password, cypher, labels_json, relationships_json)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "GOVTfyZ2vtOt"
+ },
+ "source": [
+ "Restart the runtime before continuing so the newly installed SpaCy model can be loaded\n",
+ "\n",
+ "# Data preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "IzJm4AshvtOx"
+ },
+ "outputs": [],
+ "source": [
+ "# https://www.gutenberg.org/ebooks/95 The Prisoner of Zenda\n",
+ "\n",
+ "# Fetch the data\n",
+ "target_url = 'https://www.gutenberg.org/files/95/95-0.txt'\n",
+ "import urllib.request\n",
+ "data = urllib.request.urlopen(target_url)\n",
+ "raw_data = data.read().decode('utf8').strip()\n",
+ "\n",
+ "# Preprocess text into chapters \n",
+ "import re\n",
+ "chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]  # NOTE(review): the range A-z also matches punctuation between Z and a; likely meant A-Za-z -- confirm before changing, recorded outputs depend on it\n",
+ "chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "182mC1YtvtOz"
+ },
+ "source": [
+ "# Import into Neo4j"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "MPOKdT96vtOz"
+ },
+ "outputs": [],
+ "source": [
+ "# import spacy and load an NLP model\n",
+ "import spacy\n",
+ "nlp = spacy.load(\"en_core_web_lg\", disable=[\"tagger\", \"parser\"])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "OxeIYL9kvtOz"
+ },
+ "outputs": [],
+ "source": [
+ "# Import Neo4j and define cypher queries\n",
+ "import neo4j\n",
+ "host = 'bolt://3.235.2.228:7687'\n",
+ "user = 'neo4j'\n",
+ "password = 'seats-drunks-carbon'  # NOTE(review): ephemeral sandbox credential; never commit real secrets -- prefer getpass or environment variables\n",
+ "\n",
+ "driver = neo4j.GraphDatabase.driver(host, auth=(user, password))\n",
+ "\n",
+ "save_query =\"\"\"\n",
+ "MERGE (p1:Person{name:$name1})\n",
+ "MERGE (p2:Person{name:$name2})\n",
+ "MERGE (p1)-[r:RELATED]-(p2)\n",
+ "ON CREATE SET r.score = 1\n",
+ "ON MATCH SET r.score = r.score + 1\"\"\"\n",
+ "\n",
+ "constraint_query=\"CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "wzglMHIIvtO0",
+ "outputId": "cb69fad5-405c-4795-cd45-d4b9c1d7c250",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Rassendylls Elphberg\n",
+ "Rudolf Rose \n",
+ "Rassendylls Robert\n",
+ "Robert Robert\n",
+ "Rudolf Rose \n",
+ "Robert Good heavens \n",
+ "Good heavens Rudolf\n",
+ "Rudolf Robert\n",
+ "Robert Rudolf\n",
+ "Elphberg Rassendylls\n",
+ "Burlesdon Strelsau\n",
+ "Burlesdon Amelia\n",
+ "James Burlesdon\n",
+ "James Rassendyll\n",
+ "Burlesdon Rassendyll\n",
+ "Burlesdon a Knight of the Garter\n",
+ "Rassendyll a Knight of the Garter\n",
+ "Rassendyll Rudolf\n",
+ "a Knight of the Garter Rudolf\n",
+ "Rose Nonsense \n",
+ "Jacob Jacob\n",
+ "Jacob Rudolf\n",
+ "Elphberg Elphberg\n",
+ "Elphberg Rudolf\n",
+ "Rudolf Strelsau\n",
+ "Bob Rose\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Run the analysis of the first chapter\n",
+ "c = chapters[0]\n",
+ "# Get involved\n",
+ "doc=nlp(c)\n",
+ "\n",
+ "with driver.session() as session:\n",
+ " #define constraint\n",
+ " session.run(constraint_query)\n",
+ " # Extract Person labels\n",
+ " involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n",
+ " # Preprocess text\n",
+ " decode = dict()\n",
+ " for i,x in enumerate(involved):\n",
+ " # Get mapping\n",
+ " decode['$${}$$'.format(i)] = x\n",
+ " # Preprocess text\n",
+ " c = c.replace(x,' $${}$$ '.format(i))\n",
+ " \n",
+ " # Split chapter into words\n",
+ " ws = c.split()\n",
+ " l = len(ws)\n",
+ " # Iterate through words\n",
+ " for wi,w in enumerate(ws):\n",
+ " # Skip if the word is not a person\n",
+ " if not w[:2] == '$$':\n",
+ " continue\n",
+ " # Check next x words for any involved person\n",
+ " x = 14\n",
+ " for i in range(wi+1,wi+x):\n",
+ " # Avoid list index error\n",
+ " if i >= l:\n",
+ " break\n",
+ " # Skip if the word is not a person\n",
+ " if not ws[i][:2] == '$$':\n",
+ " continue\n",
+ " # Store to Neo4j\n",
+ " params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}\n",
+ " session.run(save_query, params)\n",
+ " print(decode[ws[wi]],decode[ws[i]])\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QJQ6rAU-vtO2"
+ },
+ "source": [
+ "# Graph Analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "dKoppRH-vtO2"
+ },
+ "outputs": [],
+ "source": [
+ "# Project the graph\n",
+ "graph_projection = \"\"\"\n",
+ "CALL gds.graph.project('ch1', 'Person', {RELATED:{orientation:'UNDIRECTED'}})\n",
+ "\"\"\"\n",
+ "\n",
+ "# Run pagerank and louvain algorithm\n",
+ "pagerank =\"\"\"\n",
+ "CALL gds.pageRank.write('ch1',{writeProperty:'pagerank'})\n",
+ "\"\"\"\n",
+ "louvain = \"\"\"\n",
+ "CALL gds.louvain.write('ch1',{writeProperty:'louvain'})\n",
+ "\"\"\"\n",
+ "\n",
+ "drop_graph = \"\"\"\n",
+ "CALL gds.graph.drop('ch1')\n",
+ "\"\"\"\n",
+ "\n",
+ "with driver.session() as session:\n",
+ " session.run(graph_projection)\n",
+ " session.run(pagerank)\n",
+ " session.run(louvain)\n",
+ " session.run(drop_graph)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mIU8L_lYvtO3"
+ },
+ "source": [
+ "# Results"
+ ]
+ },
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Strelsau Burlesdon\n",
- "House Elphberg\n"
- ]
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "id": "tIBn7tj2vtO7"
+ },
+ "outputs": [],
+ "source": [
+ "# Import libraries\n",
+ "import pandas as pd\n",
+ "\n",
+ "def read_query(query, params={}):\n",
+ " with driver.session() as session:\n",
+ " result = session.run(query, params)\n",
+ " return pd.DataFrame([r.values() for r in result], columns=result.keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Evaluate pagerank\n",
+ "read_query(\"\"\"\n",
+ "MATCH (c:Person)\n",
+ "RETURN c.name AS character, c.pagerank AS score\n",
+ "ORDER BY score DESC LIMIT 5\n",
+ "\"\"\")"
+ ],
+ "metadata": {
+ "id": "UqPWBHBkx-J-",
+ "outputId": "4c9db6b6-4748-4b2d-8c69-8f0155ba0938",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ }
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " character score\n",
+ "0 Rudolf 2.234279\n",
+ "1 Burlesdon 1.550467\n",
+ "2 Robert 1.366045\n",
+ "3 Rassendyll 1.177921\n",
+ "4 Elphberg 1.115947"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " character | \n",
+ " score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Rudolf | \n",
+ " 2.234279 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Burlesdon | \n",
+ " 1.550467 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Robert | \n",
+ " 1.366045 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Rassendyll | \n",
+ " 1.177921 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Elphberg | \n",
+ " 1.115947 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Evaluate louvain\n",
+ "read_query(\"\"\"\n",
+ "MATCH (c:Person)\n",
+ "RETURN c.louvain AS community, collect(c.name) AS members\n",
+ "ORDER BY size(members) DESC\n",
+ "\"\"\")"
+ ],
+ "metadata": {
+ "id": "9Gm32eoLyE3Z",
+ "outputId": "7b97b170-90bc-4cbd-9076-f91acd3b7edf",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 175
+ }
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " community members\n",
+ "0 3 [Rudolf, Rose , Strelsau, Nonsense , Jacob]\n",
+ "1 10 [Burlesdon, Amelia, James , Rassendyll, a Kni...\n",
+ "2 5 [Rassendylls, Elphberg, Robert, Good heavens ]\n",
+ "3 15 [Bob, Rose]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " community | \n",
+ " members | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3 | \n",
+ " [Rudolf, Rose , Strelsau, Nonsense , Jacob] | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 10 | \n",
+ " [Burlesdon, Amelia, James , Rassendyll, a Kni... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5 | \n",
+ " [Rassendylls, Elphberg, Robert, Good heavens ] | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 15 | \n",
+ " [Bob, Rose] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ ""
+ ],
+ "metadata": {
+ "id": "OPJe0M-YyV5c"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.2"
+ },
+ "colab": {
+ "name": "Spacy NER with Neo4j Clustering on Gutenberg book.ipynb",
+ "provenance": [],
+ "include_colab_link": true
}
- ],
- "source": [
- "# Additional options\n",
- "# Add orgs\n",
- "c = chapters[0]\n",
- "doc = nlp(c)\n",
- "\n",
- "save_org_query = \"\"\"\n",
- "\n",
- "MERGE (p:Person{name:$person})\n",
- "MERGE (o:Organization{name:$org})\n",
- "MERGE (p)-[r:PART_OF]->(o)\n",
- "ON CREATE SET r.score = 1\n",
- "ON MATCH SET r.score = r.score + 1\n",
- "\n",
- "\"\"\"\n",
- "\n",
- "with driver.session() as session:\n",
- " # Define the mapping\n",
- " persons = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n",
- " orgs = list(set([ent.text for ent in doc.ents if ent.label_=='ORG']))\n",
- " decode_org = dict()\n",
- " decode_person = dict()\n",
- " # Replace person\n",
- " for i,p in enumerate(persons):\n",
- " decode_person['$${}$$'.format(i)] = p\n",
- " r = ' $${}$$ '.format(i)\n",
- " c = c.replace(p,r)\n",
- " # Replace organizations\n",
- " for i,o in enumerate(orgs):\n",
- " decode_org['&&{}&&'.format(i)] = o\n",
- " c = c.replace(o,' &&{}&& '.format(i)) \n",
- " # Split chapter into words\n",
- " ws = c.split()\n",
- " l = len(ws)\n",
- " for wi,w in enumerate(ws):\n",
- " # Skip if the word is not a organization\n",
- " if not w[:2] == '&&':\n",
- " continue\n",
- " # Check previous and next x words for any involved person\n",
- " x = 5\n",
- " for i in range(wi-x,wi+x):\n",
- " # Avoid list index error\n",
- " if i >= l:\n",
- " break\n",
- " # Skip if the word is not a person\n",
- " if (ws[i][:2]!='$$') or (i==wi):\n",
- " continue\n",
- " # Store to Neo4j\n",
- " # Todo: Maybe some automated mapping of name to surnames etc..\n",
- " params = {'org':decode_org[ws[wi]],'person':decode_person[ws[i]]}\n",
- " session.run(save_org_query, params)\n",
- " print(decode_org[ws[wi]],decode_person[ws[i]])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
},
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.5.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file