From f8a6f21c5fd0d9ae53fce91aca2df335cfffdb6d Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Sun, 17 Apr 2022 14:50:03 +0100 Subject: [PATCH] Created using Colaboratory --- ...h Neo4j Clustering on Gutenberg book.ipynb | 1115 +++++++++++------ 1 file changed, 708 insertions(+), 407 deletions(-) diff --git a/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb b/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb index 940a20a..7f2ace5 100644 --- a/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb +++ b/Spacy_Neo4j_Gutenberg_Book/Spacy NER with Neo4j Clustering on Gutenberg book.ipynb @@ -1,415 +1,716 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# https://www.gutenberg.org/ebooks/95 Prisoner of Zelda\n", - "\n", - "# Fetch the data\n", - "target_url = 'https://www.gutenberg.org/files/95/95-0.txt'\n", - "import urllib.request\n", - "data = urllib.request.urlopen(target_url)\n", - "raw_data = data.read().decode('utf8').strip()\n", - "\n", - "# Preprocess text into chapters \n", - "import re\n", - "chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]\n", - "chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import into Neo4j" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# import spacy and load an NLP model\n", - "import spacy\n", - "nlp = spacy.load(\"en_core_web_lg\", disable=[\"tagger\", \"parser\"])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Import Neo4j and define cypher queries\n", - "import neo4j\n", - "host = \"bolt://localhost:7687\"\n", - "user = 'neo4j'\n", - "password = 'password'\n", - "\n", - "driver = neo4j.GraphDatabase.driver(host, auth=(user, password))\n", - "\n", - "save_query =\"\"\"\n", - "MERGE (p1:Person{name:$name1})\n", - "MERGE (p2:Person{name:$name2})\n", - "MERGE (p1)-[r:RELATED]-(p2)\n", - "ON CREATE SET r.score = 1\n", - "ON MATCH SET r.score = r.score + 1\"\"\"\n", - "\n", - "constraint_query=\"CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elphbergs Rudolf\n", - "Rudolf Rose\n", - "Robert Ancestry\n", - "Ancestry Rose\n", - "Robert Robert\n", - "Rudolf Rose\n", - "Rudolf Robert\n", - "Robert Rudolf\n", - "Ruritania Burlesdon\n", - "Lady Burlesdon George II\n", - "George II King\n", - "Rudolf the Third Ruritania\n", - "Burlesdon Amelia\n", - "James Burlesdon\n", - "Burlesdon a Knight of the Garter\n", - "a Knight of the Garter Rudolf\n", - "a Knight of the Garter Ruritania\n", - "Rudolf Ruritania\n", - "Jacob Jacob\n", - "Jacob Rudolf\n", - "Elphbergs Elphberg\n", - "Elphberg Rudolf\n", - "Bob Rose\n" - ] - } - ], - "source": [ - "# Run the analysis of the first chapter\n", - "c = chapters[0]\n", - "# Get involved\n", - "doc=nlp(c)\n", - "\n", - "with driver.session() as session:\n", - " #define constraint\n", - " session.run(constraint_query)\n", - " # Extract Person labels\n", - " involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n", 
- " # Preprocess text\n", - " decode = dict()\n", - " for i,x in enumerate(involved):\n", - " # Get mapping\n", - " decode['$${}$$'.format(i)] = x\n", - " # Preprocess text\n", - " c = c.replace(x,' $${}$$ '.format(i))\n", - " \n", - " # Split chapter into words\n", - " ws = c.split()\n", - " l = len(ws)\n", - " # Iterate through words\n", - " for wi,w in enumerate(ws):\n", - " # Skip if the word is not a person\n", - " if not w[:2] == '$$':\n", - " continue\n", - " # Check next x words for any involved person\n", - " x = 14\n", - " for i in range(wi+1,wi+x):\n", - " # Avoid list index error\n", - " if i >= l:\n", - " break\n", - " # Skip if the word is not a person\n", - " if not ws[i][:2] == '$$':\n", - " continue\n", - " # Store to Neo4j\n", - " params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}\n", - " session.run(save_query, params)\n", - " print(decode[ws[wi]],decode[ws[i]])\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Graph Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Run pagerank and louvain algorithm\n", - "pagerank =\"\"\"\n", - "CALL algo.pageRank('Person','RELATED',{direction:'BOTH'})\n", - "\"\"\"\n", - "louvain = \"\"\"\n", - "CALL algo.louvain('Person','RELATED',{direction:'BOTH'})\n", - "\"\"\"\n", - "with driver.session() as session:\n", - " session.run(pagerank)\n", - " session.run(louvain)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Vizualizations\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import IFrame, HTML\n", - "import json\n", - "import uuid\n", - "\n", - "\n", - "def generate_vis(host, user, password, cypher, labels_json, relationships_json):\n", - " html = \"\"\"\\\n", - "\n", - "\n", - " Neovis.js Simple Example\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \n", - " \"\"\"\n", - "\n", - " html = html.format(\n", - " host=host,\n", - " user=user,\n", - " password=password,\n", - " cypher=cypher,\n", - " labels = json.dumps(labels_json),\n", - " relationships=json.dumps(relationships_json)\n", - " )\n", - "\n", - " unique_id = str(uuid.uuid4())\n", - " filename = \"graph-{}.html\".format(unique_id)\n", - "\n", - " with open(filename, \"w\") as f:\n", - " f.write(html)\n", - " return IFrame(src=filename, width=500, height=500)\n", - "\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "scrolled": true - }, - "outputs": [ + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, { - "data": { - "text/html": [ - "\n", - " \n", - " " + "cell_type": "markdown", + "source": [ + "* Updated to GDS 2.0 version\n", + "* Link to original blog post: https://towardsdatascience.com/network-analysis-of-prisoners-of-zenda-book-with-spacy-and-neo4j-b0839a640105" ], - "text/plain": [ - "" + "metadata": { + "id": "PLkTF151vvYa" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install neo4j spacy\n", + "!python -m spacy download en_core_web_lg\n" + ], + "metadata": { + "id": "xQuOPNtrv5tl", + "outputId": "85cb8c38-271c-482a-8f86-6a3e217b2193", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting neo4j\n", + " Downloading neo4j-4.4.2.tar.gz (89 kB)\n", + "\u001b[?25l\r\u001b[K |███▋ | 10 kB 16.4 MB/s eta 0:00:01\r\u001b[K |███████▎ | 20 kB 11.0 MB/s eta 0:00:01\r\u001b[K |███████████ | 30 kB 8.9 MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 40 kB 8.3 MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 51 kB 4.3 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 61 kB 5.0 MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 71 kB 5.4 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 81 kB 5.6 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 89 kB 3.9 MB/s \n", + "\u001b[?25hRequirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from neo4j) (2018.9)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.0)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.1.3)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.64.0)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.9.1)\n", + "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (7.4.0)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.21.5)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.6)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5)\n", + "Requirement already 
satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.6)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.4.0)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.6)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (4.11.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (3.8.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy) (4.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.10.8)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n", + "Building wheels for collected packages: neo4j\n", + " Building wheel for neo4j (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for neo4j: filename=neo4j-4.4.2-py3-none-any.whl size=115365 sha256=375f8f269c3b4b2bd5e68f1c77965807b84099501344af227bf704360c6d44e3\n", + " Stored in directory: /root/.cache/pip/wheels/10/d6/28/95029d7f69690dbc3b93e4933197357987de34fbd44b50a0e4\n", + "Successfully built neo4j\n", + "Installing collected packages: neo4j\n", + "Successfully installed neo4j-4.4.2\n", + "Collecting en_core_web_lg==2.2.5\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)\n", + "\u001b[K |████████████████████████████████| 827.9 MB 1.1 MB/s \n", + "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from en_core_web_lg==2.2.5) (2.2.4)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.21.5)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (2.23.0)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.6)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.0)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (4.64.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (57.4.0)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (3.0.6)\n", + "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (7.4.0)\n", + "Requirement 
already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.1.3)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (0.4.1)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (2.0.6)\n", + "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (1.0.5)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_lg==2.2.5) (0.9.1)\n", + "Requirement already satisfied: importlib-metadata>=0.20 in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (4.11.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (3.8.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_lg==2.2.5) (4.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (2021.10.8)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_lg==2.2.5) (1.24.3)\n", + "Building wheels for collected packages: en-core-web-lg\n", + " Building wheel for en-core-web-lg (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=575aa8e998aefd0d0fd8abaeee8b7f4f7353dd86950646d465af448f2e44ffbf\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-mrwx1m4p/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5\n", + "Successfully built en-core-web-lg\n", + "Installing collected packages: en-core-web-lg\n", + "Successfully installed en-core-web-lg-2.2.5\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the model via spacy.load('en_core_web_lg')\n" + ] + } ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create vizualization\n", - "cypher = \"MATCH (p1:Person)-[r:RELATED]->(p2:Person) RETURN *\"\n", - "\n", - "labels_json = {\n", - " \"Person\": {\n", - " \"caption\": \"name\",\n", - " \"size\": \"pagerank\",\n", - " \"community\": \"community\"\n", - " }\n", - "}\n", - "\n", - "relationships_json = {\n", - " \"RELATED\": {\n", - " \"thickness\": \"score\",\n", - " \"caption\": False\n", - " }\n", - "}\n", - "\n", - "generate_vis(host, user, password, cypher, labels_json, relationships_json)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GOVTfyZ2vtOt" + }, + "source": [ + "Restart runtime before continuing in order for SpaCy to work\n", + "\n", + "# Data preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "IzJm4AshvtOx" + }, + "outputs": [], + "source": [ + "# https://www.gutenberg.org/ebooks/95 Prisoner of Zelda\n", + "\n", + "# Fetch the data\n", + "target_url = 'https://www.gutenberg.org/files/95/95-0.txt'\n", + "import urllib.request\n", + "data = urllib.request.urlopen(target_url)\n", + "raw_data = data.read().decode('utf8').strip()\n", + "\n", + "# Preprocess text into chapters \n", + "import re\n", + "chapters = re.sub('[^A-z0-9 -]', ' ', raw_data).split('CHAPTER')[1:]\n", + "chapters[-1] = chapters[-1].split('End of the Project Gutenberg EBook')[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "182mC1YtvtOz" + }, + "source": [ + "# Import into Neo4j" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "MPOKdT96vtOz" + }, + "outputs": [], + "source": [ + "# import spacy and load an NLP model\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_lg\", disable=[\"tagger\", \"parser\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "OxeIYL9kvtOz" + }, + "outputs": [], + "source": [ + "# Import Neo4j and define cypher queries\n", + "import neo4j\n", + "host = 'bolt://3.235.2.228:7687'\n", + "user = 'neo4j'\n", + "password = 'seats-drunks-carbon'\n", + "\n", + "driver = neo4j.GraphDatabase.driver(host, auth=(user, password))\n", + "\n", + "save_query =\"\"\"\n", + "MERGE (p1:Person{name:$name1})\n", + "MERGE (p2:Person{name:$name2})\n", + "MERGE (p1)-[r:RELATED]-(p2)\n", + "ON CREATE SET r.score = 1\n", + "ON MATCH SET r.score = r.score + 1\"\"\"\n", + "\n", + "constraint_query=\"CREATE CONSTRAINT ON (p:Person) ASSERT p.name IS UNIQUE;\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "wzglMHIIvtO0", + "outputId": "cb69fad5-405c-4795-cd45-d4b9c1d7c250", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + 
"text": [ + "Rassendylls Elphberg\n", + "Rudolf Rose \n", + "Rassendylls Robert\n", + "Robert Robert\n", + "Rudolf Rose \n", + "Robert Good heavens \n", + "Good heavens Rudolf\n", + "Rudolf Robert\n", + "Robert Rudolf\n", + "Elphberg Rassendylls\n", + "Burlesdon Strelsau\n", + "Burlesdon Amelia\n", + "James Burlesdon\n", + "James Rassendyll\n", + "Burlesdon Rassendyll\n", + "Burlesdon a Knight of the Garter\n", + "Rassendyll a Knight of the Garter\n", + "Rassendyll Rudolf\n", + "a Knight of the Garter Rudolf\n", + "Rose Nonsense \n", + "Jacob Jacob\n", + "Jacob Rudolf\n", + "Elphberg Elphberg\n", + "Elphberg Rudolf\n", + "Rudolf Strelsau\n", + "Bob Rose\n" + ] + } + ], + "source": [ + "# Run the analysis of the first chapter\n", + "c = chapters[0]\n", + "# Get involved\n", + "doc=nlp(c)\n", + "\n", + "with driver.session() as session:\n", + " #define constraint\n", + " session.run(constraint_query)\n", + " # Extract Person labels\n", + " involved = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n", + " # Preprocess text\n", + " decode = dict()\n", + " for i,x in enumerate(involved):\n", + " # Get mapping\n", + " decode['$${}$$'.format(i)] = x\n", + " # Preprocess text\n", + " c = c.replace(x,' $${}$$ '.format(i))\n", + " \n", + " # Split chapter into words\n", + " ws = c.split()\n", + " l = len(ws)\n", + " # Iterate through words\n", + " for wi,w in enumerate(ws):\n", + " # Skip if the word is not a person\n", + " if not w[:2] == '$$':\n", + " continue\n", + " # Check next x words for any involved person\n", + " x = 14\n", + " for i in range(wi+1,wi+x):\n", + " # Avoid list index error\n", + " if i >= l:\n", + " break\n", + " # Skip if the word is not a person\n", + " if not ws[i][:2] == '$$':\n", + " continue\n", + " # Store to Neo4j\n", + " params = {'name1':decode[ws[wi]],'name2':decode[ws[i]]}\n", + " session.run(save_query, params)\n", + " print(decode[ws[wi]],decode[ws[i]])\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QJQ6rAU-vtO2" + }, + "source": [ + "# Graph Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "dKoppRH-vtO2" + }, + "outputs": [], + "source": [ + "# Project the graph\n", + "graph_projection = \"\"\"\n", + "CALL gds.graph.project('ch1', 'Person', {RELATED:{orientation:'UNDIRECTED'}})\n", + "\"\"\"\n", + "\n", + "# Run pagerank and louvain algorithm\n", + "pagerank =\"\"\"\n", + "CALL gds.pageRank.write('ch1',{writeProperty:'pagerank'})\n", + "\"\"\"\n", + "louvain = \"\"\"\n", + "CALL gds.louvain.write('ch1',{writeProperty:'louvain'})\n", + "\"\"\"\n", + "\n", + "drop_graph = \"\"\"\n", + "CALL gds.graph.drop('ch1')\n", + "\"\"\"\n", + "\n", + "with driver.session() as session:\n", + " session.run(graph_projection)\n", + " session.run(pagerank)\n", + " session.run(louvain)\n", + " session.run(drop_graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mIU8L_lYvtO3" + }, + "source": [ + "# Results" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Strelsau Burlesdon\n", - "House Elphberg\n" - ] + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "tIBn7tj2vtO7" + }, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "\n", + "def read_query(query, params={}):\n", + " with driver.session() as session:\n", + " result = session.run(query, params)\n", + " return pd.DataFrame([r.values() for r in result], columns=result.keys())" + ] + }, + { + "cell_type": "code", + "source": [ + "# Evaluate 
pagerank\n", + "read_query(\"\"\"\n", + "MATCH (c:Person)\n", + "RETURN c.name AS character, c.pagerank AS score\n", + "ORDER BY score DESC LIMIT 5\n", + "\"\"\")" + ], + "metadata": { + "id": "UqPWBHBkx-J-", + "outputId": "4c9db6b6-4748-4b2d-8c69-8f0155ba0938", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " character score\n", + "0 Rudolf 2.234279\n", + "1 Burlesdon 1.550467\n", + "2 Robert 1.366045\n", + "3 Rassendyll 1.177921\n", + "4 Elphberg 1.115947" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
characterscore
0Rudolf2.234279
1Burlesdon1.550467
2Robert1.366045
3Rassendyll1.177921
4Elphberg1.115947
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Evaluate louvain\n", + "read_query(\"\"\"\n", + "MATCH (c:Person)\n", + "RETURN c.louvain AS community, collect(c.name) AS members\n", + "ORDER BY size(members) DESC\n", + "\"\"\")" + ], + "metadata": { + "id": "9Gm32eoLyE3Z", + "outputId": "7b97b170-90bc-4cbd-9076-f91acd3b7edf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + } + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " community members\n", + "0 3 [Rudolf, Rose , Strelsau, Nonsense , Jacob]\n", + "1 10 [Burlesdon, Amelia, James , Rassendyll, a Kni...\n", + "2 5 [Rassendylls, Elphberg, Robert, Good heavens ]\n", + "3 15 [Bob, Rose]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communitymembers
03[Rudolf, Rose , Strelsau, Nonsense , Jacob]
110[Burlesdon, Amelia, James , Rassendyll, a Kni...
25[Rassendylls, Elphberg, Robert, Good heavens ]
315[Bob, Rose]
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "OPJe0M-YyV5c" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + }, + "colab": { + "name": "Spacy NER with Neo4j Clustering on Gutenberg book.ipynb", + "provenance": [], + "include_colab_link": true } - ], - "source": [ - "# Additional options\n", - "# Add orgs\n", - "c = chapters[0]\n", - "doc = nlp(c)\n", - "\n", - "save_org_query = \"\"\"\n", - "\n", - "MERGE (p:Person{name:$person})\n", - "MERGE (o:Organization{name:$org})\n", - "MERGE (p)-[r:PART_OF]->(o)\n", - "ON CREATE SET r.score = 1\n", - "ON MATCH SET r.score = r.score + 1\n", - "\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " # Define the mapping\n", - " persons = list(set([ent.text for ent in doc.ents if ent.label_=='PERSON']))\n", - " orgs = list(set([ent.text for ent in doc.ents if ent.label_=='ORG']))\n", - " decode_org = dict()\n", - " decode_person = dict()\n", - " # Replace person\n", - " for i,p in enumerate(persons):\n", - " decode_person['$${}$$'.format(i)] = p\n", - " r = ' $${}$$ '.format(i)\n", - " c = c.replace(p,r)\n", - " # Replace organizations\n", - " for i,o in enumerate(orgs):\n", - " decode_org['&&{}&&'.format(i)] = o\n", - " c = c.replace(o,' &&{}&& '.format(i)) \n", - " # Split chapter into words\n", - " ws = c.split()\n", - " l = len(ws)\n", - " for wi,w in enumerate(ws):\n", - " # Skip if the word is not a organization\n", - " if not w[:2] == '&&':\n", - " continue\n", - " # Check previous and next x words for any involved person\n", - " x = 5\n", - " for i in range(wi-x,wi+x):\n", - " # Avoid list index error\n", - " if i >= l:\n", - " break\n", - " # Skip if the word is not a person\n", - " if (ws[i][:2]!='$$') or (i==wi):\n", - " continue\n", - " # Store to Neo4j\n", - " # Todo: Maybe some automated mapping of name to surnames etc..\n", - " params = {'org':decode_org[ws[wi]],'person':decode_person[ws[i]]}\n", - " session.run(save_org_query, params)\n", - " print(decode_org[ws[wi]],decode_person[ws[i]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file