From 56cc3ad2738571f99df592b8db0487537e882891 Mon Sep 17 00:00:00 2001 From: Tomaz Bratanic Date: Mon, 11 May 2020 23:39:02 +0200 Subject: [PATCH] Add apoc nlp notebook --- apoc_nlp_procedures/APOC NLP procedures.ipynb | 2114 +++++++++++++++++ 1 file changed, 2114 insertions(+) create mode 100644 apoc_nlp_procedures/APOC NLP procedures.ipynb diff --git a/apoc_nlp_procedures/APOC NLP procedures.ipynb b/apoc_nlp_procedures/APOC NLP procedures.ipynb new file mode 100644 index 0000000..4c94175 --- /dev/null +++ b/apoc_nlp_procedures/APOC NLP procedures.ipynb @@ -0,0 +1,2114 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Define Neo4j connections\n", + "from neo4j import GraphDatabase\n", + "host = 'neo4j://localhost:7687'\n", + "user = 'neo4j'\n", + "password = 'letmein'\n", + "driver = GraphDatabase.driver(host, auth=(user, password))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "\n", + "def read_query(query):\n", + " # Run a Cypher query and return the results as a pandas DataFrame\n", + " with driver.session() as session:\n", + " result = session.run(query)\n", + " return pd.DataFrame([r.values() for r in result], columns=result.keys())\n", + " \n", + "def drop_graph(name):\n", + " # Drop a named in-memory GDS graph\n", + " with driver.session() as session:\n", + " drop_graph_query = \"\"\"\n", + " CALL gds.graph.drop('{}');\n", + " \"\"\".format(name)\n", + " session.run(drop_graph_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Agenda\n", + "- Graph import\n", + "- Text classification\n", + "- Named-entity recognition\n", + "- Sentiment analysis\n", + "- Unipartite projection of a bipartite network\n", + "- Community detection\n", + "- Data enrichment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Graph import\n", + "\n", + "We will be using the Kaggle News dataset made available by Kevin Toms. It contains the content of around 10,500 news articles. The dataset was prepared in January 2019. Unfortunately, the author offers no information about the source and the timeline of the dataset. We have to download the dataset and copy it to the import folder.\n", + "\n", + "\n", + "We will import each article as a separate node with the title and the content stored as properties of the node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph_import_query = \"\"\"\n", + "\n", + "LOAD CSV WITH HEADERS FROM \"file:///text_summarizer_data.csv\" as row \n", + "CREATE (a:Article{title:row.title, content: row.content})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(graph_import_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Text classification with Google\n", + "\n", + "We will start our analysis with the APOC text classification procedure powered by Google's Natural Language API. It classifies the input text into distinct content categories. As far as I have noticed, it works best on news articles and not so well on fictional literature. Google offers 30,000 free classification requests per month, which is quite enough to start a hobby project.\n", + "\n", + "As we need to process more than 10,000 articles, we will be using the apoc.periodic.iterate\n", + "procedure to handle the batching process. Learn more about it in the documentation. The process will take a couple of minutes, so feel free to grab a coffee or do your favorite yoga pose while you wait."
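The queries in this notebook read credentials with `apoc.static.get`, so the API keys have to be configured up front. A minimal sketch of the relevant entries, assuming APOC 4.x reads them from `apoc.conf` (older releases used `neo4j.conf`); the key names match the `gcp.apiKey`, `aws.apiKey`, and `aws.apiSecret` lookups used throughout, while the placeholder values are yours to fill in:

```
# apoc.conf - static values resolved by apoc.static.get()
apoc.static.gcp.apiKey=<your GCP API key>
apoc.static.aws.apiKey=<your AWS access key id>
apoc.static.aws.apiSecret=<your AWS secret access key>
```

Alternatively, `CALL apoc.static.set('gcp.apiKey', '...')` can set a value at runtime, but that value lives in memory only and does not survive a restart.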
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text_classification = \"\"\"\n", + "\n", + "CALL apoc.periodic.iterate(\" \n", + " // get all articles \n", + " MATCH (node:Article) RETURN node \n", + " \",\" \n", + " // classify each article \n", + " CALL apoc.nlp.gcp.classify.graph(node, { \n", + " // we retrieve gcp api key from static value storage\n", + " key: apoc.static.get('gcp.apiKey'),\n", + " // node property that contains the text\n", + " nodeProperty: 'content',\n", + " write:true }) YIELD graph RETURN distinct 'done'\", \n", + " {batchSize:10})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(text_classification)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While you sip your delicious coffee, we can take a look at the results of the classification. Let's start by viewing some example category names just so we know what we are dealing with." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
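Before (or while) running the full batch in write mode, the stream variant can be used to spot-check a single article without persisting anything. A hedged sketch, following the `apoc.nlp.gcp.classify.stream` signature from the APOC NLP documentation; the `categories`, `name`, and `confidence` fields mirror the Natural Language API response as I recall it:

```python
# Preview GCP classification for one article; nothing is written to the graph
preview_classification_query = """
MATCH (node:Article)
WITH node LIMIT 1
CALL apoc.nlp.gcp.classify.stream(node, {
    key: apoc.static.get('gcp.apiKey'),
    nodeProperty: 'content'
}) YIELD value
UNWIND value.categories AS category
RETURN category.name AS category, category.confidence AS confidence
"""

read_query(preview_classification_query)
```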
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
category
0/Arts & Entertainment/Movies
1/Finance/Insurance
2/News
3/Arts & Entertainment/Celebrities & Entertainm...
4/Sports/Team Sports/Cricket
\n", + "
" + ], + "text/plain": [ + " category\n", + "0 /Arts & Entertainment/Movies\n", + "1 /Finance/Insurance\n", + "2 /News\n", + "3 /Arts & Entertainment/Celebrities & Entertainm...\n", + "4 /Sports/Team Sports/Cricket" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inspect_categories_query = \"\"\"\n", + "\n", + "MATCH (n:Category) \n", + "RETURN n.name as category LIMIT 5\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(inspect_categories_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Category names are between one and three levels deep. Levels are separated by the slash character ( / ). For example, \"/News\" contains only one level, while \"/Finance/Insurance\" contains two levels of categories. We will extract the top-level category for each classification name and store it back to our graph. This will enable us to filter and group by top-level categories more straightforwardly.\n", + "\n", + "When dealing with hierarchical trees in Neo4j, there are some rules I learned along the way that can help us simplify and optimize our queries. One of them is that we should have a single relationship type throughout the whole tree. This way, we can easily query one or two levels deep, as we will see.\n", + "\n", + "Let's define a unique constraint for the top-level category node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "read_query(\"CREATE CONSTRAINT ON (t:TopCategory) ASSERT t.name IS UNIQUE;\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can extract and store the top-level category by merely splitting the classification name." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "top_category_extraction_query = \"\"\"\n", + "\n", + "MATCH (n:Category) \n", + "WITH n, split(n.name,'/')[1] as top_level\n", + "MERGE (t:TopCategory{name:top_level})\n", + "MERGE (n)-[:CATEGORY]->(t)\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(top_category_extraction_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check out the number of articles by the top-level category. You can observe that with only a single relationship type in our category tree, traversing two levels deep is very simple." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorynumber_of_articles
0News4041
1Arts & Entertainment2487
2Law & Government2036
3Sports1470
4Business & Industrial1044
5Sensitive Subjects837
6People & Society538
7Science337
8Finance295
9Travel226
\n", + "
" + ], + "text/plain": [ + " category number_of_articles\n", + "0 News 4041\n", + "1 Arts & Entertainment 2487\n", + "2 Law & Government 2036\n", + "3 Sports 1470\n", + "4 Business & Industrial 1044\n", + "5 Sensitive Subjects 837\n", + "6 People & Society 538\n", + "7 Science 337\n", + "8 Finance 295\n", + "9 Travel 226" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inspect_top_level_categories_query = \"\"\"\n", + "\n", + "MATCH (t:TopCategory) \n", + "RETURN t.name as category,\n", + " size((t)<-[:CATEGORY*2..2]-()) as number_of_articles \n", + "ORDER BY number_of_articles DESC \n", + "LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(inspect_top_level_categories_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One thing to notice is that the results of the query show more articles than there actually are in our graph. This is due to some articles having more than a single classification. We will quickly inspect results by looking at the content of two articles from the Science category." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0Scientists in the UK have demonstrated a \"comm...
1The SI unit of mass is redefined using the Pla...
\n", + "
" + ], + "text/plain": [ + " text\n", + "0 Scientists in the UK have demonstrated a \"comm...\n", + "1 The SI unit of mass is redefined using the Pla..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_science_articles = \"\"\"\n", + "\n", + "MATCH (t:TopCategory{name:\"Science\"})<-[:CATEGORY*2..2]-(article) \n", + "RETURN article.content as text LIMIT 2\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(example_science_articles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Both articles seem to be heavily focused on science and technology. We didn't really do an in-depth analysis of the results, but we will assume that the Natural Language API does its job of classifying news articles well.\n", + "\n", + "### Named-entity recognition\n", + "\n", + "In essence, NER is a process of identifying various entities in text and grouping them in categories such as persons, organizations, locations, and more. Which entity types we are looking for, depends entirely on the use-case. Sometimes we want to know which persons and organizations are mentioned in a given article, and other times we might be more interested in knowing which genes and proteins are referred. There exist a plethora of pre-trained models that you can use, and if none works for you, you can always train your own NER model, but this is beyond the scope of this blog post.\n", + "\n", + "Both AWS and GCP offer their own NER models in their cloud API portfolio. We will compare them on two random stories and choose the more fitting for our use-case." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0Aegon Life iTerm insurance plan helps you save...
1An 81-year-old woman named Eileen Macken who g...
\n", + "
" + ], + "text/plain": [ + " text\n", + "0 Aegon Life iTerm insurance plan helps you save...\n", + "1 An 81-year-old woman named Eileen Macken who g..." + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "example_ner_articles = \"\"\"\n", + "\n", + "MATCH (node:Article) \n", + "WITH node LIMIT 2 \n", + "RETURN node.content as text\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(example_ner_articles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At first glance, we can observe that AWS found fewer entities than GCP. It also does not include words like \"woman\" and \"mother\" under person entities. My opinion is that if you are trying to do some sort of topic modeling, then maybe GCP might be a better fit, while for creating a knowledge graph, AWS might be better. We will use AWS to analyze and extract entities from all articles in our database. As we need to batch our NER process, we will use the apoc.periodic.iterate procedure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ner_batch_process_query = \"\"\"\n", + "\n", + "CALL apoc.periodic.iterate(\"\n", + " MATCH (n:Article) \n", + " WITH collect(n) as total\n", + " // Create a batch of 25 articles\n", + " CALL apoc.coll.partition(total,25) \n", + " YIELD value as nodes RETURN nodes\n", + " \",\"\n", + " CALL apoc.nlp.aws.entities.graph(nodes, { \n", + " key: apoc.static.get('aws.apiKey'), \n", + " secret: apoc.static.get('aws.apiSecret'), \n", + " nodeProperty: 'content', \n", + " relationshipType: 'AWS_ENTITY', \n", + " // store the results to Neo4j \n", + " write:true }) YIELD graph RETURN distinct 'done'\", \n", + " {batchSize:1})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(ner_batch_process_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you hurry, you have time for another great yoga pose before the NER process finishes. Let me know how it goes. Alright, we can now check the results of the NER process. We will begin by investigating the most mentioned people in the articles." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
personmentions
0PM560
1Narendra Modi488
2President355
3CM342
4CEO276
5Prime Minister248
6Rahul Gandhi215
7Virat Kohli212
8Modi163
9Union Minister157
\n", + "
" + ], + "text/plain": [ + " person mentions\n", + "0 PM 560\n", + "1 Narendra Modi 488\n", + "2 President 355\n", + "3 CM 342\n", + "4 CEO 276\n", + "5 Prime Minister 248\n", + "6 Rahul Gandhi 215\n", + "7 Virat Kohli 212\n", + "8 Modi 163\n", + "9 Union Minister 157" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ner_results_query = \"\"\"\n", + "\n", + "MATCH (n:PERSON) \n", + "RETURN n.text as person, \n", + " size((n)<-[:AWS_ENTITY]-()) as mentions \n", + "ORDER BY mentions DESC \n", + "LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(ner_results_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We chose AWS as it did not include words like \"orphan\" or \"genealogist\" under person type. I guess no model is perfect as we can see that \"PM\", \"CEO\", and \"President\" appear as person entities. One way to solve this problem would be to add coreference resolution as a step of the NER pipeline, but as we are dealing with third-party NLP pipelines, we don't have that luxury. Let's now look at the most mentioned persons in sports articles." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
personmentions
0Virat Kohli197
1Kohli110
2MS Dhoni65
3Sachin Tendulkar64
4Rishabh Pant49
5Rohit Sharma45
6Cheteshwar Pujara45
7Hardik Pandya42
8Dhoni42
9Gautam Gambhir41
\n", + "
" + ], + "text/plain": [ + " person mentions\n", + "0 Virat Kohli 197\n", + "1 Kohli 110\n", + "2 MS Dhoni 65\n", + "3 Sachin Tendulkar 64\n", + "4 Rishabh Pant 49\n", + "5 Rohit Sharma 45\n", + "6 Cheteshwar Pujara 45\n", + "7 Hardik Pandya 42\n", + "8 Dhoni 42\n", + "9 Gautam Gambhir 41" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sports_person_query = \"\"\"\n", + "\n", + "MATCH (n:PERSON) \n", + "RETURN n.text as person, \n", + " size((n)<-[:AWS_ENTITY]-()-[:CATEGORY*2..2]->({name:'Sports'})) as mentions \n", + "ORDER BY mentions \n", + "DESC LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(sports_person_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All top ten mentioned persons in sports are Indian cricket players. Virat Kohli really stands out, so I guess he is the captain of the team. One weak point of such NER extraction is that Virat Kohli and Kohli are treated as two separate entities. This can be explained that he is sometimes mentioned with full name and other times only with the last name. Another interesting use-case would be to look at which locations and events show up in the same article the most." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
locationeventmentions
0Las VegasCES 201911
1BritainBrexit10
2UKBrexit9
3EUBrexit6
4KolkataUnited India Rally6
5USChristmas6
6RajasthanAssembly elections5
7ArgentinaG205
8BhubaneswarHockey World Cup5
9DavosWorld Economic Forum4
\n", + "
" + ], + "text/plain": [ + " location event mentions\n", + "0 Las Vegas CES 2019 11\n", + "1 Britain Brexit 10\n", + "2 UK Brexit 9\n", + "3 EU Brexit 6\n", + "4 Kolkata United India Rally 6\n", + "5 US Christmas 6\n", + "6 Rajasthan Assembly elections 5\n", + "7 Argentina G20 5\n", + "8 Bhubaneswar Hockey World Cup 5\n", + "9 Davos World Economic Forum 4" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "location_event_query = \"\"\"\n", + "\n", + "MATCH (a:Article)-[:AWS_ENTITY]->(event:EVENT),\n", + " (a)-[:AWS_ENTITY]->(location:LOCATION)\n", + "RETURN location.text as location,\n", + " event.text as event,\n", + " count(*) as mentions\n", + "ORDER BY mentions DESC\n", + "LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(location_event_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results look quite nice. It looks like CES 2019 happened in Las Vegas, United India Rally in Kolkata, G20 forum in Argentina, and Word Economic Forum in Davos. On top of that, there is talk about Brexit between the UK and the EU.\n", + "\n", + "### Sentiment analysis\n", + "\n", + "AWS also offers sentiment analysis in its Comprehend API bundle. It categorizes text into four distinct sentiment groups:\n", + "\n", + "- Positive\n", + "- Neutral\n", + "- Negative\n", + "- Mixed\n", + "\n", + "We will run the sentiment analysis on all of our articles. I would imagine that more or less news should be neutral, except for maybe sports. There is a particular APOC procedure apoc.nlp.aws.sentiment.* that handles AWS sentiment API requests with the same two modes (stream & graph) as the procedure for NER processing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sentiment_analysis_query = \"\"\"\n", + "\n", + "CALL apoc.periodic.iterate(\"\n", + " MATCH (n:Article) \n", + " WITH collect(n) as total \n", + " CALL apoc.coll.partition(total,25) \n", + " YIELD value as nodes RETURN nodes \n", + " \",\" \n", + " CALL apoc.nlp.aws.sentiment.graph(nodes, { \n", + " key: apoc.static.get('aws.apiKey'),\n", + " secret: apoc.static.get('aws.apiSecret'),\n", + " nodeProperty: 'content', write:true }) \n", + " YIELD graph RETURN distinct 'done'\", \n", + " {batchSize:1})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(sentiment_analysis_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will look at co-mentions of persons and events in sports articles that AWS Comprehend deemed as positive." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
personeventmentions
0Serena WilliamsAustralian Open3
1Roger FedererAustralian Open3
2Virat KohliMelbourne Test3
3Gautam GambhirIPL2
4Serena WilliamsGrand Slam2
5FedererAustralian Open2
6Naomi OsakaAustralian Open2
7Roger FedererGrand Slam2
8GambhirIPL2
9Virat Kohli2011 World Cup2
\n", + "
" + ], + "text/plain": [ + " person event mentions\n", + "0 Serena Williams Australian Open 3\n", + "1 Roger Federer Australian Open 3\n", + "2 Virat Kohli Melbourne Test 3\n", + "3 Gautam Gambhir IPL 2\n", + "4 Serena Williams Grand Slam 2\n", + "5 Federer Australian Open 2\n", + "6 Naomi Osaka Australian Open 2\n", + "7 Roger Federer Grand Slam 2\n", + "8 Gambhir IPL 2\n", + "9 Virat Kohli 2011 World Cup 2" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sentiment_analysis_results = \"\"\"\n", + "\n", + "MATCH (a:Article) \n", + "WHERE a.sentiment = 'Positive' AND \n", + " (a)-[:CATEGORY*2..2]->({name:'Sports'}) \n", + "MATCH (a)-[:AWS_ENTITY]->(person:PERSON), \n", + " (a)-[:AWS_ENTITY]->(event:EVENT) \n", + "RETURN person.text as person,\n", + " event.text as event,\n", + " count(*) as mentions \n", + " ORDER BY mentions DESC\n", + "LIMIT 10\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(sentiment_analysis_results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It looks like tennis players are winning tennis tournaments. The Indian cricket team with Virat Kohli as their captain has won the 2011 World cup and Melbourne Test. Or so I might imagine given the results.\n", + "\n", + "## Graph data science library\n", + "\n", + "If you have read any of my blog posts, you know that I like to write about the Graph Data Science library. This blog is no different. First, we will project a bipartite network to a unipartite one with the help of the Node similarity algorithm. In the next step, we will search for communities within the projected unipartite graph with the Louvain algorithm.\n", + "\n", + "### Unipartite projection of a bipartite network\n", + "\n", + "A bipartite network is just a fancy way of saying that the graph contains two distinct sets of nodes, and likewise, a unipartite network contains only a single set of nodes. In our example, we will start with a bipartite network that includes both articles and NER entities. In the next step, we will project it to a unipartite network with the help of similarity algorithms in GDS.\n", + "\n", + "We can think of unipartite projection as a process of translating indirect relationships to direct ones. The difference between similarity algorithms is just the metric being used to calculate similarity score or weight. For example, the Node similarity algorithm uses the Jaccard similarity score. It is defined as the size of the intersection divided by the size of the union. If we wanted to, we could also project a unipartite network of articles instead of persons and inspect how similar articles are based on the entities mentioned in them.\n", + "\n", + "Before we run any algorithms, let's just quickly refresh how does the GDS library work.\n", + "\n", + "The graph analytics pipeline consists of three main parts. In the first part, the graph loader reads the stored graph from Neo4j and loads it as an in-memory projected graph. We can use either native projection or cypher projection to load the projected graph. In the second step, we execute the graph algorithms in sequence. We can use the results of one graph algorithm as an input to another one. Last but not least, we store or stream the results back to Neo4j.\n", + "\n", + "We will use the cypher projection to load the in-memory graph. If you need a quick refresher on how it works, I suggest you take a look at the official documentation. 
In the node statement, we will describe all articles that are in the top-level category of news and also all person entities. In the relationship statement, we will describe all the links between news articles and person entities." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeQueryrelationshipQuerygraphNamenodeCountrelationshipCountcreateMillis
0MATCH (a:Article) \\n WHERE (a)-[:CATEGORY*...// match all links between news articles and p...person_similarity1188813847620
\n", + "
" + ], + "text/plain": [ + " nodeQuery \\\n", + "0 MATCH (a:Article) \\n WHERE (a)-[:CATEGORY*... \n", + "\n", + " relationshipQuery graphName \\\n", + "0 // match all links between news articles and p... person_similarity \n", + "\n", + " nodeCount relationshipCount createMillis \n", + "0 11888 13847 620 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_projected_graph_query = \"\"\"\n", + "\n", + "CALL gds.graph.create.cypher(\"person_similarity\", \n", + " // match articles that are in the 'News' category \n", + " \"MATCH (a:Article) \n", + " WHERE (a)-[:CATEGORY*2..2]->({name:'News'}) \n", + " RETURN id(a) as id, labels(a) as labels \n", + " UNION \n", + " // match all person entities\n", + " MATCH (p:PERSON) \n", + " RETURN id(p) as id, labels(p) as labels \n", + " \",\" \n", + " // match all links between news articles and person entities \n", + " MATCH (a:Article)-[r:AWS_ENTITY]->(p:PERSON) \n", + " WHERE (a)-[:CATEGORY*2..2]->({name:'News'}) \n", + " RETURN id(p) as source, id(a) as target, type(r) as type\")\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(load_projected_graph_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodesminp25p50p75p90mean
039990.00.060.170.51.00.32
\n", + "
" + ], + "text/plain": [ + " nodes min p25 p50 p75 p90 mean\n", + "0 3999 0.0 0.06 0.17 0.5 1.0 0.32" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "node_similarity_stats_query = \"\"\"\n", + "\n", + "CALL gds.nodeSimilarity.stats('person_similarity') \n", + "YIELD nodesCompared, similarityDistribution \n", + "RETURN nodesCompared as nodes, \n", + " apoc.math.round(similarityDistribution.min,2) as min,\n", + " apoc.math.round(similarityDistribution.p25,2) as p25,\n", + " apoc.math.round(similarityDistribution.p50,2) as p50,\n", + " apoc.math.round(similarityDistribution.p75,2) as p75,\n", + " apoc.math.round(similarityDistribution.p90,2) as p90,\n", + " apoc.math.round(similarityDistribution.mean,2) as mean\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(node_similarity_stats_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At first glance, the nodes don't seem to be very similar. This is fine. We want to infer a sparse network as community detection algorithms fare poorly on very dense graphs. Determining the most optimal similarity parameter values requires a combination of art and science, but with some experience, you will get good at it. We will set the similarityCutoff to be 0.6 while leaving the degreeCutoff and topK parameters at their default values. Using the mutate mode, we store the results of the algorithm back to the in-memory projected graph." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
createMilliscomputeMillismutateMillispostProcessingMillisnodesComparedrelationshipsWrittensimilarityDistributionconfiguration
0062919-139993778{'p1': 0.6190452575683594, 'max': 1.0000038146...{'topK': 10, 'degreeCutoff': 1, 'bottomK': 10,...
\n", + "
" + ], + "text/plain": [ + " createMillis computeMillis mutateMillis postProcessingMillis \\\n", + "0 0 629 19 -1 \n", + "\n", + " nodesCompared relationshipsWritten \\\n", + "0 3999 3778 \n", + "\n", + " similarityDistribution \\\n", + "0 {'p1': 0.6190452575683594, 'max': 1.0000038146... \n", + "\n", + " configuration \n", + "0 {'topK': 10, 'degreeCutoff': 1, 'bottomK': 10,... " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "node_similarity_mutate_query = \"\"\"\n", + "\n", + "CALL gds.nodeSimilarity.mutate('person_similarity', \n", + " {degreeCutoff:1, similarityCutoff:0.6, \n", + " topK:10, mutateRelationshipType: 'SIMILAR', \n", + " mutateProperty: 'score'})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(node_similarity_mutate_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Community detection\n", + "\n", + "Community detection algorithms are designed to help us understand the structure of complex networks. The most apparent application is finding groups of friends in a social network. We think of a community as a densely connected group of nodes, similar to how a group of friends is highly interconnected.\n", + "\n", + "In this blog post, we will use the Louvain algorithm to inspect the community structure of our inferred similarity network." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communityIdmembers
01891[Leander Paes, Miguel Ángel Reyes-Varela, Aust...
14515[MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,...
25523[Army General, Mark Milley, Chairman of the Jo...
33293[Halle Berry, Harrison Ford, Chadwick Boseman,...
45217[Vanisha Mittal, Kylie Minogue, Sanjay Hinduja...
\n", + "
" + ], + "text/plain": [ + " communityId members\n", + "0 1891 [Leander Paes, Miguel Ángel Reyes-Varela, Aust...\n", + "1 4515 [MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,...\n", + "2 5523 [Army General, Mark Milley, Chairman of the Jo...\n", + "3 3293 [Halle Berry, Harrison Ford, Chadwick Boseman,...\n", + "4 5217 [Vanisha Mittal, Kylie Minogue, Sanjay Hinduja..." + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "louvain_stream_query = \"\"\"\n", + "\n", + "CALL gds.louvain.stream('person_similarity',\n", + " {nodeLabels:['PERSON'], relationshipTypes:['SIMILAR']})\n", + "YIELD nodeId, communityId\n", + "RETURN communityId,\n", + " collect(gds.util.asNode(nodeId).text) as members \n", + "ORDER BY size(members) DESC LIMIT 5\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(louvain_stream_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The communities found are relatively small. This is directly a consequence of the similarityCutoff parameter value. If we chose a lower threshold, more nodes would be deemed similar, and likely we would get bigger communities. Likewise, we would also get bigger communities if we increased the size of our dataset and processed a couple thousand articles more. That being said, it is time to move to the data enrichment chapter and add some context to our entities.\n", + "\n", + "### Data Enrichment\n", + "\n", + "We found a couple of small communities in our graph. Some persons are quite famous, and it is not hard to find a common denominator between the members of communities. For example, I know that Harrison Ford, Halle Berry, and Jim Parsons are all actors, so I would assume the rest of the group are also actors. We could go and google them. A better idea would be to programmatically enrich the entities in our graph using external data providers like Google Knowledge Graph or WikiData. That is precisely what we are going to do next.\n", + "\n", + "We are going to enrich only the persons that were returned in the previous query (members of the top five largest communities). To avoid running the community detection multiple times, we will first store the results of the Louvain algorithm back to Neo4j." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodePropertiesWrittencreateMilliscomputeMilliswriteMillispostProcessingMillisranLevelscommunityCountmodularitymodularitiescommunityDistributionconfiguration
078511423466165610.997357[0.9973570202904951]{'p99': 4, 'min': 1, 'max': 8, 'mean': 1.19661...{'maxIterations': 10, 'writeConcurrency': 4, '...
\n", + "
" + ], + "text/plain": [ + " nodePropertiesWritten createMillis computeMillis writeMillis \\\n", + "0 7851 1 423 46 \n", + "\n", + " postProcessingMillis ranLevels communityCount modularity \\\n", + "0 6 1 6561 0.997357 \n", + "\n", + " modularities communityDistribution \\\n", + "0 [0.9973570202904951] {'p99': 4, 'min': 1, 'max': 8, 'mean': 1.19661... \n", + "\n", + " configuration \n", + "0 {'maxIterations': 10, 'writeConcurrency': 4, '... " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "louvain_write_query = \"\"\"\n", + "\n", + "CALL gds.louvain.write('person_similarity', \n", + " {nodeLabels:['PERSON'], relationshipTypes:['SIMILAR'], \n", + " writeProperty:'louvain'})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(louvain_write_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Google Knowledge Graph\n", + "\n", + "A long time ago I have already used Google Knowledge Graph API in one of my blog posts. One of the reasons I write is that I have a repository of useful cypher queries that I can copy-paste from and use in later analyses.\n", + "\n", + "We will use the Knowledge Graph API to enrich the members of the largest communities with description and detailed description properties." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "enrich_with_gkg = \"\"\"\n", + "\n", + "// get the members of the top five communities \n", + "MATCH (p:PERSON) \n", + "WITH p.louvain as communityId, \n", + " collect(p) as members ORDER BY size(members) \n", + " DESC LIMIT 5 \n", + " UNWIND members as member \n", + " WITH member, apoc.text.urlencode(member.text) as name, apoc.static.get('gcp.apiKey') as key \n", + " // send a request to KG API \n", + " CALL apoc.load.json(\"https://kgsearch.googleapis.com/v1/entities:search?query=\" \n", + " + name + \"&key=\" + key + \"&limit=1&indent=True\") YIELD value \n", + " WITH member, value['itemListElement'][0]['result'] as results \n", + " // store results back to Neo4j \n", + " SET member.kg_description = results.description, \n", + " member.kg_detailed_description = results.detailedDescription.articleBody\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(enrich_with_gkg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now look at the largest five communities of persons and include their Knowledge Graph descriptions." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communityIdmembersdescription
01891[Leander Paes, Miguel Ángel Reyes-Varela, Aust...[Indian tennis player, Tennis player, American...
14515[MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,...[City in India]
25523[Army General, Mark Milley, Chairman of the Jo...[General]
33293[Halle Berry, Harrison Ford, Chadwick Boseman,...[American actress, American actor, Kenyan-Mexi...
45217[Vanisha Mittal, Kylie Minogue, Sanjay Hinduja...[Indian businessperson, Singer-songwriter, Bus...
\n", + "
" + ], + "text/plain": [ + " communityId members \\\n", + "0 1891 [Leander Paes, Miguel Ángel Reyes-Varela, Aust... \n", + "1 4515 [MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,... \n", + "2 5523 [Army General, Mark Milley, Chairman of the Jo... \n", + "3 3293 [Halle Berry, Harrison Ford, Chadwick Boseman,... \n", + "4 5217 [Vanisha Mittal, Kylie Minogue, Sanjay Hinduja... \n", + "\n", + " description \n", + "0 [Indian tennis player, Tennis player, American... \n", + "1 [City in India] \n", + "2 [General] \n", + "3 [American actress, American actor, Kenyan-Mexi... \n", + "4 [Indian businessperson, Singer-songwriter, Bus... " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_five_with_kg_query = \"\"\"\n", + "\n", + "MATCH (p:PERSON) \n", + "RETURN p.louvain as communityId, \n", + " collect(p.text) as members, \n", + " collect(DISTINCT p.kg_description) as description \n", + "ORDER BY size(members) DESC LIMIT 5\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(top_five_with_kg_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With the addition of the enriched descriptions, we get a better sense of who are the members of communities. We can observe that the largest cluster is comprised of tennis players. The knowledge graph doesn't identify any members of the second-largest group. It does, however, know that \"Uluberia\" is actually a city in India and not a person.\n", + "\n", + "### Wikidata enrichment\n", + "\n", + "The main reason why I am subscribed to the weekly twin4j developer newsletter is that I can copy-paste cypher queries that other people have written. Mark Needham did a whole series on how to query WikiData with cypher. He did all the hard work, so we can enjoy the fruits of his labor by copy-pasting his cypher queries. He also developed the APOC NLP procedures we just used, so kudos to Mark!\n", + "\n", + "For all of you who have never heard of the Wikidata before, they describe themselves as a free and open knowledge base that can be read and edited by both humans and machines. After inspecting it for a bit, I can say that it is quite wealthy with information and definitely worth checking out. We will add the date of birth, occupation, and nationality information to the members of the largest communities. Check out the Wikidata query service for more details about data enrichment options." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
batchestotaltimeTakencommittedOperationsfailedOperationsfailedBatchesretrieserrorMessagesbatchoperationswasTerminatedfailedParams
02344634000{}{'total': 2, 'committed': 2, 'failed': 0, 'err...{'total': 34, 'committed': 34, 'failed': 0, 'e...False{}
\n", + "
" + ], + "text/plain": [ + " batches total timeTaken committedOperations failedOperations \\\n", + "0 2 34 46 34 0 \n", + "\n", + " failedBatches retries errorMessages \\\n", + "0 0 0 {} \n", + "\n", + " batch \\\n", + "0 {'total': 2, 'committed': 2, 'failed': 0, 'err... \n", + "\n", + " operations wasTerminated \\\n", + "0 {'total': 34, 'committed': 34, 'failed': 0, 'e... False \n", + "\n", + " failedParams \n", + "0 {} " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikidata_enrich_query = \"\"\"\n", + "\n", + "CALL apoc.periodic.iterate( \n", + " // get all persons from the biggest five communities \n", + " \"MATCH (p:PERSON) \n", + " WITH p.louvain as communityId, collect(p) as members \n", + " ORDER BY size(members) DESC LIMIT 5 \n", + " UNWIND members as member \n", + " RETURN member \n", + " \",\" \n", + " // prepare a sparql query \n", + " WITH 'SELECT * WHERE { ?person rdfs:label \\\\\\\"' + member.text + '\\\\\\\"@en ; \n", + " wdt:P569 ?dateOfBirth ; \n", + " wdt:P106 [ rdfs:label ?occupation ] ; \n", + " wdt:P27 [ rdfs:label ?countryName ] . \n", + " filter(lang(?countryName) = \\\\\\\"en\\\\\\\" && lang(?occupation) = \\\\\\\"en\\\\\\\") }' AS sparql, member \n", + " // make a request to wikidata \n", + " CALL apoc.load.jsonParams( \\\\\\\"https://query.wikidata.org/sparql?query=\\\\\\\" + apoc.text.urlencode(sparql), \n", + " { Accept: \\\\\\\"application/sparql-results+json\\\\\\\"}, null ) \n", + " YIELD value \n", + " CALL apoc.do.when( \n", + " // if there are any results \n", + " size(value.results.bindings) > 0, \n", + " // store results \n", + " 'WITH value.results.bindings[0] AS result, member \n", + " SET member.dateOfBirth = date(datetime(result.dateOfBirth.value)), \n", + " member.wikidataImportDone = true \n", + " // store nationality \n", + " MERGE (c:Country {name: result.countryName.value }) \n", + " MERGE (member)-[:NATIONALITY]->(c) \n", + " // store occupation \n", + " MERGE (o:Occupation {name: result.occupation.value}) \n", + " MERGE (member)-[:HAS_OCCUPATION]->(o) RETURN member', \n", + " // else if no results \n", + " 'SET member.wikidataImportDone = true RETURN member', \n", + " {value: value, member: member}) YIELD value AS result RETURN count(*)\", \n", + " {batchSize: 20})\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(wikidata_enrich_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now inspect the largest communities and include the additional information we acquired from Wikidata." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communityIdmembersaverage_agenationalitiesoccupations
01891[Leander Paes, Miguel Ángel Reyes-Varela, Aust...35.83[India, Mexico, United States of America, Russia][tennis player]
14515[MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,...NaN[][]
25523[Army General, Mark Milley, Chairman of the Jo...NaN[][]
33293[Halle Berry, Harrison Ford, Chadwick Boseman,...58.17[United States of America, Mexico][actor, stage actor, film actor, television ac...
45217[Vanisha Mittal, Kylie Minogue, Sanjay Hinduja...45.00[United Kingdom, Puerto Rico, United States of...[actor, singer-songwriter, composer]
\n", + "
" + ], + "text/plain": [ + " communityId members \\\n", + "0 1891 [Leander Paes, Miguel Ángel Reyes-Varela, Aust... \n", + "1 4515 [MLA Sujit Bose, Uluberia, MLA Dr Nirmal Maji,... \n", + "2 5523 [Army General, Mark Milley, Chairman of the Jo... \n", + "3 3293 [Halle Berry, Harrison Ford, Chadwick Boseman,... \n", + "4 5217 [Vanisha Mittal, Kylie Minogue, Sanjay Hinduja... \n", + "\n", + " average_age nationalities \\\n", + "0 35.83 [India, Mexico, United States of America, Russia] \n", + "1 NaN [] \n", + "2 NaN [] \n", + "3 58.17 [United States of America, Mexico] \n", + "4 45.00 [United Kingdom, Puerto Rico, United States of... \n", + "\n", + " occupations \n", + "0 [tennis player] \n", + "1 [] \n", + "2 [] \n", + "3 [actor, stage actor, film actor, television ac... \n", + "4 [actor, singer-songwriter, composer] " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_five_with_wiki_query = \"\"\"\n", + "\n", + "MATCH (p:PERSON) \n", + "RETURN p.louvain as communityId, \n", + " collect(p.text) as members,\n", + " apoc.math.round(avg(duration.inMonths(p.dateOfBirth, date()).years),2) as average_age,\n", + " collect(distinct [(p)-[:NATIONALITY]->(n) | n.name][0]) as nationalities,\n", + " collect(distinct [(p)-[:HAS_OCCUPATION]->(o) | o.name][0]) as occupations \n", + "ORDER BY size(members) DESC \n", + "LIMIT 5\n", + "\n", + "\"\"\"\n", + "\n", + "read_query(top_five_with_wiki_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results look very impressive. I bet you didn't know that there are at least two people named Jim Parsons. One of them is an actor, and the other one is a race car driver. And we are only scratching the surface of the information that is available through Wikidata API." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "drop_graph('person_similarity')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}