From ed5a4df3af3f5a27248673bbf339455a986ebf1c Mon Sep 17 00:00:00 2001 From: abhay Date: Sun, 10 May 2020 20:59:05 -0500 Subject: [PATCH] add graph processing --- .../pagerank-python-checkpoint.ipynb | 280 +++++++++++++++++ graph/pagerank-python.ipynb | 287 ++++++++++++++++++ 2 files changed, 567 insertions(+) create mode 100644 graph/.ipynb_checkpoints/pagerank-python-checkpoint.ipynb create mode 100644 graph/pagerank-python.ipynb diff --git a/graph/.ipynb_checkpoints/pagerank-python-checkpoint.ipynb b/graph/.ipynb_checkpoints/pagerank-python-checkpoint.ipynb new file mode 100644 index 0000000..4a5abe3 --- /dev/null +++ b/graph/.ipynb_checkpoints/pagerank-python-checkpoint.ipynb @@ -0,0 +1,280 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import *\n", + "from pyspark.sql.functions import pandas_udf, PandasUDFType\n", + "from pyspark.sql.types import *\n", + "from graphframes import *\n", + "from pyspark.sql.functions import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"PageRank\") \\\n", + " .config(\"spark.executor.heartbeatInterval\", \"30000s\")\\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkSession - hive

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v2.4.3
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
PySparkShell
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "pages = spark.read.format(\"csv\").load(\"../data/w3data/pagerank/*pages.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "links = spark.read.format(\"csv\").load(\"../data/w3data/pagerank/*links.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "links=links.select(col(\"_c0\").alias(\"src\"), (col(\"_c1\").alias(\"dst\")))\n", + "pages=pages.select(col(\"_c0\").alias(\"id\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------------------------------------------------------------+\n", + "|id |\n", + "+---------------------------------------------------------------------------------+\n", + "|https://w3.ibm.com/w3publisher/google-analytics/education-training/when-to-use-ga|\n", + "|https://w3.ibm.com/w3publisher/gts-efc/events/initiatives/balance-sheet-reporting|\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor |\n", + "|https://w3.ibm.com/w3publisher/gts-next/gts-narrative |\n", + "|https://w3.ibm.com/w3publisher/gbs-eu-pde/archive/consulting-fund-2 |\n", + "+---------------------------------------------------------------------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "pages.show(5, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------------------------------------------+-------------------------------------------------------------+\n", + "|src |dst |\n", + "+---------------------------------------------------------------+-------------------------------------------------------------+\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/why-we-patent |\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/awards |\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/ |\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|http://w3.ibm.com/w3/info_terms_of_use.html |\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/tips-and-presentations|\n", + "+---------------------------------------------------------------+-------------------------------------------------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "links.show(5, False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a graphframe from vertices and edges" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "w3graph = GraphFrame(pages, links)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run a Pagerank algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "w3pagerank = w3graph.pageRank(resetProbability=0.15, maxIter=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------------------------------------------------------------------------------------+------------------+\n", + "|id |pagerank |\n", + "+-----------------------------------------------------------------------------------------------------+------------------+\n", + "|https://w3.ibm.com/w3publisher |918.6350758663948 |\n", + "|https://w3.ibm.com/w3publisher/w3-privacy-notice/ |917.9575882531376 |\n", + "|https://w3.ibm.com/w3publisher/sales-and-delivery-101-mod-3/consult-to-operate |29.492137597197754|\n", + "|https://w3.ibm.com/w3publisher/process-portal-benelux/process-portal-benelux/contact |28.53062710038013 |\n", + "|https://w3.ibm.com/w3publisher/ |27.380086802454347|\n", + "|https://w3.ibm.com/w3publisher/privacy-at-ibm/gdpr-awareness-course |26.689342455816792|\n", + "|https://w3.ibm.com/w3publisher/undefined/ |24.013627161928223|\n", + "|https://w3.ibm.com/w3publisher/undefined// |16.587042868259328|\n", + "|https://w3.ibm.com/w3publisher/hpc2018 |16.22722095383144 |\n", + "|https://w3.ibm.com/w3publisher/businessresourcegroups |15.559478172024198|\n", + "|https://w3.ibm.com/w3publisher/pps-shortcuts/ |13.204535900388832|\n", + "|https://w3.ibm.com/w3publisher/celebrations/ar-operations |12.726049892941768|\n", + "|https://w3.ibm.com/w3publisher/blog |11.593287201869359|\n", + "|https://w3.ibm.com/w3publisher/women-at-ibm |11.359712827645462|\n", + "|https://w3.ibm.com/w3publisher/new-collar/new-collar/meet-the-apprentices |11.112107646371827|\n", + "|https://w3.ibm.com/w3publisher/people-with-disabilities-gateway |10.843015784715224|\n", + "|https://w3.ibm.com/w3publisher/order-management-education/2019/undefined/ |10.79000282342931 |\n", + "|https://w3.ibm.com/w3publisher/tss-canada-quarterly-newsletter |10.704340508292939|\n", + "|https://w3.ibm.com/w3publisher/sales-and-delivery-101-mod-4/iot-sensor-demo |10.669126926881551|\n", + "|https://w3.ibm.com/w3publisher/uaab4lh |10.608933822202024|\n", + "|https://w3.ibm.com/w3publisher/europe-gbs-red-hat-enablement |10.591033920619854|\n", + "|https://w3.ibm.com/w3publisher/ai-ibm/use-cases/undefined/use-cases |10.555391423742247|\n", + "|https://w3.ibm.com/w3publisher/sample-course/sales-opportunities%2Fiot-value-proposition |10.46516987243922 |\n", + "|https://w3.ibm.com/w3publisher/internet-of-things-sales-delivery-101-m56/m6nextsteps |10.19311456800684 |\n", + "|https://w3.ibm.com/w3publisher/ibm-cloud-bundles/undefined/ |10.033878023150816|\n", + "|https://w3.ibm.com/w3publisher/fastcloud/undefined/ |10.033878023150816|\n", + "|https://w3.ibm.com/w3publisher/pridegateway |9.879788844611001 |\n", + "|https://w3.ibm.com/w3publisher/veterans-gateway |9.762175399658117 |\n", + "|https://w3.ibm.com/w3publisher/hispanics-at-ibm-gateway |9.259024590999664 |\n", + "|https://w3.ibm.com/w3publisher/asian-employee-gateway |9.20173532785606 |\n", + "|https://w3.ibm.com/w3publisher/tap-education/introduction |9.07120765979852 |\n", + "|https://w3.ibm.com/w3publisher/talent-acquisition-for-hr-professionals/cognitive-solutions |9.025255534937381 |\n", + "|https://w3.ibm.com/w3publisher/native-americans-at-ibm-gateway |8.979308389889017 |\n", + "|https://w3.ibm.com/w3publisher/tap-education/unit-1 |8.755475070378825 |\n", + "|https://w3.ibm.com/w3publisher/meet-the-team |8.631114750692095 |\n", + "|https://w3.ibm.com/w3publisher/sales-and-delivery-101-mod-4/ibm-watson-iot-platform |8.571226920933485 |\n", + "|https://w3.ibm.com/w3publisher/zresilience-web/ |8.329998922120668 |\n", + "|https://w3.ibm.com/w3publisher/think2019/sales-resources |8.26912511795766 |\n", + "|https://w3.ibm.com/w3publisher/resources |8.262864092305383 |\n", + "|https://w3.ibm.com/w3publisher/black-gateway |8.121392329423122 |\n", + "|https://w3.ibm.com/w3publisher/pps-shortcuts/pps-contacts |8.107727069638473 |\n", + "|https://w3.ibm.com/w3publisher/sibos-2019 |7.9565126202668655|\n", + "|https://w3.ibm.com/w3publisher/inspire-me/inspire-me/events |7.877418776708795 |\n", + "|https://w3.ibm.com/w3publisher/elc-mentorship/find-a-mentor/undefined/find-a-mentor/business-mentors |7.877418776708795 |\n", + "|https://w3.ibm.com/w3publisher/byod/why-powerbi |7.688728592645118 |\n", + "|https://w3.ibm.com/w3publisher/ai-ibm/use-cases |7.661772852064592 |\n", + "|https://w3.ibm.com/w3publisher/gbs-l-k-spotlight-monthly-may-2018/gbs-l-k-spotlight-monthly-may-2018/|7.551928209198955 |\n", + "|https://w3.ibm.com/w3publisher/gts-mea-take-5/take-5-form |7.542097794489055 |\n", + "|https://w3.ibm.com/w3publisher/ces-2019/ginni-keynote |7.524301045618796 |\n", + "|https://w3.ibm.com/w3publisher/gbs-l-k-spotlight-monthly-jul-2018/ |7.338303965098289 |\n", + "+-----------------------------------------------------------------------------------------------------+------------------+\n", + "only showing top 50 rows\n", + "\n" + ] + } + ], + "source": [ + "w3pagerank.vertices.orderBy(\"pagerank\", ascending=False).show(50, False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/graph/pagerank-python.ipynb b/graph/pagerank-python.ipynb new file mode 100644 index 0000000..55f7757 --- /dev/null +++ b/graph/pagerank-python.ipynb @@ -0,0 +1,287 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import *\n", + "from pyspark.sql.functions import pandas_udf, PandasUDFType\n", + "from pyspark.sql.types import *\n", + "from graphframes import *\n", + "from pyspark.sql.functions import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "spark = SparkSession \\\n", + " .builder \\\n", + " .appName(\"PageRank\") \\\n", + " .config(\"spark.executor.heartbeatInterval\", \"30000s\")\\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkSession - hive

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v2.4.3
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
PySparkShell
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "pages = spark.read.format(\"csv\").load(\"../data/w3data/pagerank/*pages.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "links = spark.read.format(\"csv\").load(\"../data/w3data/pagerank/*links.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "links=links.select(col(\"_c0\").alias(\"src\"), (col(\"_c1\").alias(\"dst\")))\n", + "pages=pages.select(col(\"_c0\").alias(\"id\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------------------------------------------------------------+\n", + "|id |\n", + "+---------------------------------------------------------------------------------+\n", + "|https://w3.ibm.com/w3publisher/google-analytics/education-training/when-to-use-ga|\n", + "|https://w3.ibm.com/w3publisher/gts-efc/events/initiatives/balance-sheet-reporting|\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor |\n", + "|https://w3.ibm.com/w3publisher/gts-next/gts-narrative |\n", + "|https://w3.ibm.com/w3publisher/gbs-eu-pde/archive/consulting-fund-2 |\n", + "+---------------------------------------------------------------------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "pages.show(5, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------------------------------------------+-------------------------------------------------------------+\n", + "|src |dst |\n", + "+---------------------------------------------------------------+-------------------------------------------------------------+\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/why-we-patent |\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/awards |\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/ |\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|http://w3.ibm.com/w3/info_terms_of_use.html |\n", + "|https://w3.ibm.com/w3publisher/patents/research-master-inventor|https://w3.ibm.com/w3publisher/patents/tips-and-presentations|\n", + "+---------------------------------------------------------------+-------------------------------------------------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "links.show(5, False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a graphframe from vertices and edges" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "w3graph = GraphFrame(pages, links)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run a Pagerank algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "w3pagerank = w3graph.pageRank(resetProbability=0.15, maxIter=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------------------------------------------------------------------------------------+------------------+\n", + "|id |pagerank |\n", + "+-----------------------------------------------------------------------------------------------------+------------------+\n", + "|https://w3.ibm.com/w3publisher |918.6350758663948 |\n", + "|https://w3.ibm.com/w3publisher/w3-privacy-notice/ |917.9575882531376 |\n", + "|https://w3.ibm.com/w3publisher/sales-and-delivery-101-mod-3/consult-to-operate |29.492137597197754|\n", + "|https://w3.ibm.com/w3publisher/process-portal-benelux/process-portal-benelux/contact |28.53062710038013 |\n", + "|https://w3.ibm.com/w3publisher/ |27.380086802454347|\n", + "|https://w3.ibm.com/w3publisher/privacy-at-ibm/gdpr-awareness-course |26.689342455816792|\n", + "|https://w3.ibm.com/w3publisher/undefined/ |24.013627161928223|\n", + "|https://w3.ibm.com/w3publisher/undefined// |16.587042868259328|\n", + "|https://w3.ibm.com/w3publisher/hpc2018 |16.22722095383144 |\n", + "|https://w3.ibm.com/w3publisher/businessresourcegroups |15.559478172024198|\n", + "|https://w3.ibm.com/w3publisher/pps-shortcuts/ |13.204535900388832|\n", + "|https://w3.ibm.com/w3publisher/celebrations/ar-operations |12.726049892941768|\n", + "|https://w3.ibm.com/w3publisher/blog |11.593287201869359|\n", + "|https://w3.ibm.com/w3publisher/women-at-ibm |11.359712827645462|\n", + "|https://w3.ibm.com/w3publisher/new-collar/new-collar/meet-the-apprentices |11.112107646371827|\n", + "|https://w3.ibm.com/w3publisher/people-with-disabilities-gateway |10.843015784715224|\n", + "|https://w3.ibm.com/w3publisher/order-management-education/2019/undefined/ |10.79000282342931 |\n", + "|https://w3.ibm.com/w3publisher/tss-canada-quarterly-newsletter |10.704340508292939|\n", + "|https://w3.ibm.com/w3publisher/sales-and-delivery-101-mod-4/iot-sensor-demo |10.669126926881551|\n", + "|https://w3.ibm.com/w3publisher/uaab4lh |10.608933822202024|\n", + "|https://w3.ibm.com/w3publisher/europe-gbs-red-hat-enablement |10.591033920619854|\n", + "|https://w3.ibm.com/w3publisher/ai-ibm/use-cases/undefined/use-cases |10.555391423742247|\n", + "|https://w3.ibm.com/w3publisher/sample-course/sales-opportunities%2Fiot-value-proposition |10.46516987243922 |\n", + "|https://w3.ibm.com/w3publisher/internet-of-things-sales-delivery-101-m56/m6nextsteps |10.19311456800684 |\n", + "|https://w3.ibm.com/w3publisher/ibm-cloud-bundles/undefined/ |10.033878023150816|\n", + "|https://w3.ibm.com/w3publisher/fastcloud/undefined/ |10.033878023150816|\n", + "|https://w3.ibm.com/w3publisher/pridegateway |9.879788844611001 |\n", + "|https://w3.ibm.com/w3publisher/veterans-gateway |9.762175399658117 |\n", + "|https://w3.ibm.com/w3publisher/hispanics-at-ibm-gateway |9.259024590999664 |\n", + "|https://w3.ibm.com/w3publisher/asian-employee-gateway |9.20173532785606 |\n", + "|https://w3.ibm.com/w3publisher/tap-education/introduction |9.07120765979852 |\n", + "|https://w3.ibm.com/w3publisher/talent-acquisition-for-hr-professionals/cognitive-solutions |9.025255534937381 |\n", + "|https://w3.ibm.com/w3publisher/native-americans-at-ibm-gateway |8.979308389889017 |\n", + "|https://w3.ibm.com/w3publisher/tap-education/unit-1 |8.755475070378825 |\n", + "|https://w3.ibm.com/w3publisher/meet-the-team |8.631114750692095 |\n", + "|https://w3.ibm.com/w3publisher/sales-and-delivery-101-mod-4/ibm-watson-iot-platform |8.571226920933485 |\n", + "|https://w3.ibm.com/w3publisher/zresilience-web/ |8.329998922120668 |\n", + "|https://w3.ibm.com/w3publisher/think2019/sales-resources |8.26912511795766 |\n", + "|https://w3.ibm.com/w3publisher/resources |8.262864092305383 |\n", + "|https://w3.ibm.com/w3publisher/black-gateway |8.121392329423122 |\n", + "|https://w3.ibm.com/w3publisher/pps-shortcuts/pps-contacts |8.107727069638473 |\n", + "|https://w3.ibm.com/w3publisher/sibos-2019 |7.9565126202668655|\n", + "|https://w3.ibm.com/w3publisher/inspire-me/inspire-me/events |7.877418776708795 |\n", + "|https://w3.ibm.com/w3publisher/elc-mentorship/find-a-mentor/undefined/find-a-mentor/business-mentors |7.877418776708795 |\n", + "|https://w3.ibm.com/w3publisher/byod/why-powerbi |7.688728592645118 |\n", + "|https://w3.ibm.com/w3publisher/ai-ibm/use-cases |7.661772852064592 |\n", + "|https://w3.ibm.com/w3publisher/gbs-l-k-spotlight-monthly-may-2018/gbs-l-k-spotlight-monthly-may-2018/|7.551928209198955 |\n", + "|https://w3.ibm.com/w3publisher/gts-mea-take-5/take-5-form |7.542097794489055 |\n", + "|https://w3.ibm.com/w3publisher/ces-2019/ginni-keynote |7.524301045618796 |\n", + "|https://w3.ibm.com/w3publisher/gbs-l-k-spotlight-monthly-jul-2018/ |7.338303965098289 |\n", + "+-----------------------------------------------------------------------------------------------------+------------------+\n", + "only showing top 50 rows\n", + "\n" + ] + } + ], + "source": [ + "w3pagerank.vertices.orderBy(\"pagerank\", ascending=False).show(50, False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}