From 610a1557ea706af97be2a857e5cd9cfb8131fe0d Mon Sep 17 00:00:00 2001
From: abhay
Date: Sun, 10 May 2020 20:59:42 -0500
Subject: [PATCH] add machine learning examples

---
 machine-learning/machine-learning.ipynb | 208 ++++++++++++++++++
 1 file changed, 208 insertions(+)
 create mode 100644 machine-learning/machine-learning.ipynb

"execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Make predictions on test documents and print columns of interest.\n", + "prediction = model.transform(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "selected = prediction.select(\"id\", \"text\", \"probability\", \"prediction\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------------------+--------------------+----------+\n", + "| id| text| probability|prediction|\n", + "+---+------------------+--------------------+----------+\n", + "| 4| spark i j k|[0.15964077387874...| 1.0|\n", + "| 5| l m n|[0.83783256854767...| 0.0|\n", + "| 6|spark hadoop spark|[0.06926633132976...| 1.0|\n", + "| 7| apache hadoop|[0.98215753334442...| 0.0|\n", + "+---+------------------+--------------------+----------+\n", + "\n" + ] + } + ], + "source": [ + "selected.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for row in selected.collect():\n", + " rid, text, prob, prediction = row\n", + " print(\"(%d, %s) --> prob=%s, prediction=%f\" % (rid, text, str(prob), prediction))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/machine-learning/machine-learning.ipynb b/machine-learning/machine-learning.ipynb new file mode 100644 index 0000000..6f48b73 --- /dev/null +++ b/machine-learning/machine-learning.ipynb @@ -0,0 +1,202 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.classification import LogisticRegression\n", + "from pyspark.ml.feature import HashingTF, Tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare training documents from a list of (id, text, label) tuples.\n", + "training = spark.createDataFrame([\n", + " (0, \"a b c d e spark\", 1.0),\n", + " (1, \"b d\", 0.0),\n", + " (2, \"spark f g h\", 1.0),\n", + " (3, \"hadoop mapreduce\", 0.0)\n", + "], [\"id\", \"text\", \"label\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+----------------+-----+\n", + "| id| text|label|\n", + "+---+----------------+-----+\n", + "| 0| a b c d e spark| 1.0|\n", + "| 1| b d| 0.0|\n", + "| 2| spark f g h| 1.0|\n", + "| 3|hadoop mapreduce| 0.0|\n", + "+---+----------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "training.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.\n", + "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"words\")\n", + "hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol=\"features\")\n", + "lr = LogisticRegression(maxIter=10, regParam=0.001)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Fit the pipeline to training documents.\n", + "model = pipeline.fit(training)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare test documents, which are unlabeled (id, text) tuples.\n", + "test = spark.createDataFrame([\n", + " (4, \"spark i j k\"),\n", + " (5, \"l m n\"),\n", + " (6, \"spark hadoop spark\"),\n", + " (7, \"apache hadoop\")\n", + "], [\"id\", \"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------------------+\n", + "| id| text|\n", + "+---+------------------+\n", + "| 4| spark i j k|\n", + "| 5| l m n|\n", + "| 6|spark hadoop spark|\n", + "| 7| apache hadoop|\n", + "+---+------------------+\n", + "\n" + ] + } + ], + "source": [ + "test.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Make predictions on test documents and print columns of interest.\n", + "prediction = model.transform(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "selected = prediction.select(\"id\", \"text\", \"probability\", \"prediction\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+------------------+--------------------+----------+\n", + "| id| text| probability|prediction|\n", + "+---+------------------+--------------------+----------+\n", + "| 4| spark i j k|[0.15964077387874...| 1.0|\n", + "| 5| l m n|[0.83783256854767...| 0.0|\n", + "| 6|spark hadoop spark|[0.06926633132976...| 1.0|\n", + "| 7| apache hadoop|[0.98215753334442...| 0.0|\n", + "+---+------------------+--------------------+----------+\n", + "\n" + ] + } + ], + "source": [ + "selected.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for row in selected.collect():\n", + " rid, text, prob, prediction = row\n", + " print(\"(%d, %s) --> prob=%s, prediction=%f\" % (rid, text, str(prob), prediction))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}