Uploaded files

tirthajyoti committed Jul 11, 2019
1 parent 03591bc commit ba9ab52
Showing 76 changed files with 41,155 additions and 1,091 deletions.
219 changes: 204 additions & 15 deletions DataFrame_Operations.ipynb → DataFrame_operations_basics.ipynb
@@ -1,5 +1,14 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Spark DataFrame basic operations\n",
"### Dr. Tirthajyoti Sarkar, Fremont, CA 94536\n",
"In this notebook, we go through basic operations that can be performed with a Spark DataFrame object. We will use a .CSV file of stock prices to illustrate the code."
]
},
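{
"cell_type": "markdown",
"metadata": {},
"source": [
"*(The SparkSession setup and CSV-loading cells are collapsed in this diff. Below is a minimal sketch of the typical setup; the file name `appl_stock.csv` and the app name `Basics` are assumptions, not read from the collapsed cells.)*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumed file and app names): start a SparkSession and load the CSV\n",
"from pyspark.sql import SparkSession\n",
"spark = SparkSession.builder.appName('Basics').getOrCreate()\n",
"df = spark.read.csv('appl_stock.csv', inferSchema=True, header=True)"
]
},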
{
"cell_type": "code",
"execution_count": 1,
@@ -35,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -51,7 +60,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -83,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -132,7 +141,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -141,7 +150,7 @@
"['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -159,7 +168,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -188,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -198,7 +207,7 @@
" Row(Date=datetime.datetime(2010, 1, 5, 0, 0), Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002)]"
]
},
"execution_count": 12,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -209,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -218,7 +227,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -233,7 +242,7 @@
" 'Adj Close': 27.727039}"
]
},
"execution_count": 15,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -259,7 +268,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -293,7 +302,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -322,7 +331,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -344,6 +353,186 @@
"source": [
"df.filter(\"Close < 500 AND Open > 500\").show(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Now we use DataFrame syntax to achieve the same output "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+----------+----------+------------------+------------------+---------+------------------+\n",
"| Date| Open| High| Low| Close| Volume| Adj Close|\n",
"+-------------------+----------+----------+------------------+------------------+---------+------------------+\n",
"|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|\n",
"|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|\n",
"|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|\n",
"|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|\n",
"|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|\n",
"+-------------------+----------+----------+------------------+------------------+---------+------------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"df.filter(df['Close']<500).show(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### If we need to chain multiple conditions together, use `&` for AND and `|` for OR and clearly separate the conditions by putting them inside parantheses"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+----------+------------------+------------------+------------------+---------+---------+\n",
"| Date| Open| High| Low| Close| Volume|Adj Close|\n",
"+-------------------+----------+------------------+------------------+------------------+---------+---------+\n",
"|2012-02-15 00:00:00|514.259995| 526.290016|496.88998399999997| 497.669975|376530000|64.477899|\n",
"|2013-09-05 00:00:00|500.250008|500.67997699999995|493.63997699999993|495.26997400000005| 59091900|65.977837|\n",
"|2013-09-10 00:00:00|506.199997| 507.450012| 489.500015|494.63999900000005|185798900|65.893915|\n",
"|2014-01-30 00:00:00|502.539993|506.49997699999994| 496.70002| 499.779984|169625400|66.967353|\n",
"+-------------------+----------+------------------+------------------+------------------+---------+---------+\n",
"\n"
]
}
],
"source": [
"df.filter((df['Close']<500) & (df['Open']>500)).show(5)"
]
},
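{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (not from the original notebook) of the `|` (OR) chaining mentioned above, with each condition again wrapped in parentheses:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: OR-chaining with `|`, each condition in its own parentheses\n",
"df.filter((df['Close'] < 500) | (df['Open'] > 500)).show(5)"
]
},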
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### We can use `==` to compare with an exact value for comparison and `~` for NOT operator"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------------+------------------+----------+------+------+---------+---------+\n",
"| Date| Open| High| Low| Close| Volume|Adj Close|\n",
"+-------------------+------------------+----------+------+------+---------+---------+\n",
"|2010-01-22 00:00:00|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|\n",
"+-------------------+------------------+----------+------+------+---------+---------+\n",
"\n"
]
}
],
"source": [
"df.filter(df['Low']==197.16).show()"
]
},
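{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `~` (NOT) operator mentioned above is not exercised in the cells shown here; a minimal sketch would be to negate the equality test, keeping every row where `Low` is not 197.16:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: negate a condition with `~`\n",
"df.filter(~(df['Low'] == 197.16)).show(5)"
]
},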
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Use the `collect` method instead of `show`, to collect the actual data"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"low_data = df.filter(df['Low']==197.16).collect()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"low_data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### It is still a list. So, grab the 0-index element as a Row object and convert it to a dictionary using `asDict` method"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"dt = low_data[0]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Date': datetime.datetime(2010, 1, 22, 0, 0),\n",
" 'Open': 206.78000600000001,\n",
" 'High': 207.499996,\n",
" 'Low': 197.16,\n",
" 'Close': 197.75,\n",
" 'Volume': 220441900,\n",
" 'Adj Close': 25.620401}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dt.asDict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, you can do whatever processing you want to do with the dictionary object!"
]
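},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, here is a minimal sketch (the `spread` computation is illustrative, not from the original notebook) that pulls values out of the dictionary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: work with the plain Python dict returned by asDict()\n",
"row_dict = dt.asDict()\n",
"spread = row_dict['High'] - row_dict['Low']\n",
"print(f\"Intraday spread on {row_dict['Date'].date()}: {spread:.2f}\")"
]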
}
],
"metadata": {
@@ -362,7 +551,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
"version": "3.6.8"
}
},
"nbformat": 4,