Skip to content

Commit

Permalink
dataframe examples
Browse files Browse the repository at this point in the history
  • Loading branch information
abhayibm committed May 11, 2020
1 parent 7461efc commit 7688246
Show file tree
Hide file tree
Showing 6 changed files with 2,260 additions and 0 deletions.
156 changes: 156 additions & 0 deletions dataframes/joins.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Build the example DataFrames (person, graduateProgram, sparkStatus) used in the rest of this notebook\n",
"person = spark.createDataFrame([\n",
" (0, \"Bill Chambers\", 0, [100]),\n",
" (1, \"Matei Zaharia\", 1, [500, 250, 100]),\n",
" (2, \"Michael Armbrust\", 1, [250, 100])])\\\n",
" .toDF(\"id\", \"name\", \"graduate_program\", \"spark_status\")\n",
"graduateProgram = spark.createDataFrame([\n",
" (0, \"Masters\", \"School of Information\", \"UC Berkeley\"),\n",
" (2, \"Masters\", \"EECS\", \"UC Berkeley\"),\n",
" (1, \"Ph.D.\", \"EECS\", \"UC Berkeley\")])\\\n",
" .toDF(\"id\", \"degree\", \"department\", \"school\")\n",
"sparkStatus = spark.createDataFrame([\n",
" (500, \"Vice President\"),\n",
" (250, \"PMC Member\"),\n",
" (100, \"Contributor\")])\\\n",
" .toDF(\"id\", \"status\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+----------------+----------------+---------------+\n",
"| id| name|graduate_program| spark_status|\n",
"+---+----------------+----------------+---------------+\n",
"| 0| Bill Chambers| 0| [100]|\n",
"| 1| Matei Zaharia| 1|[500, 250, 100]|\n",
"| 2|Michael Armbrust| 1| [250, 100]|\n",
"+---+----------------+----------------+---------------+\n",
"\n"
]
}
],
"source": [
"person.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+-------+--------------------+-----------+\n",
"| id| degree| department| school|\n",
"+---+-------+--------------------+-----------+\n",
"| 0|Masters|School of Informa...|UC Berkeley|\n",
"| 2|Masters| EECS|UC Berkeley|\n",
"| 1| Ph.D.| EECS|UC Berkeley|\n",
"+---+-------+--------------------+-----------+\n",
"\n"
]
}
],
"source": [
"graduateProgram.show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+--------------+\n",
"| id| status|\n",
"+---+--------------+\n",
"|500|Vice President|\n",
"|250| PMC Member|\n",
"|100| Contributor|\n",
"+---+--------------+\n",
"\n"
]
}
],
"source": [
"sparkStatus.show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"person.createOrReplaceTempView(\"person\")\n",
"graduateProgram.createOrReplaceTempView(\"graduateProgram\")\n",
"sparkStatus.createOrReplaceTempView(\"sparkStatus\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+----------------+----------------+---------------+---+-------+--------------------+-----------+\n",
"| id| name|graduate_program| spark_status| id| degree| department| school|\n",
"+---+----------------+----------------+---------------+---+-------+--------------------+-----------+\n",
"| 0| Bill Chambers| 0| [100]| 0|Masters|School of Informa...|UC Berkeley|\n",
"| 1| Matei Zaharia| 1|[500, 250, 100]| 1| Ph.D.| EECS|UC Berkeley|\n",
"| 2|Michael Armbrust| 1| [250, 100]| 1| Ph.D.| EECS|UC Berkeley|\n",
"+---+----------------+----------------+---------------+---+-------+--------------------+-----------+\n",
"\n"
]
}
],
"source": [
"spark.sql(\"SELECT * FROM person JOIN graduateProgram ON person.graduate_program = graduateProgram.id\").show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
104 changes: 104 additions & 0 deletions dataframes/spark-ui.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Row(sum(id)=2500000000000)]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Multiply one repartitioned range by 5, join it to a second range on id, and sum the matching ids\n",
"df1 = spark.range(2, 10000000, 2)\n",
"df2 = spark.range(2, 10000000, 4)\n",
"step1 = df1.repartition(5)\n",
"step12 = df2.repartition(6)\n",
"step2 = step1.selectExpr(\"id * 5 as id\")\n",
"step3 = step2.join(step12, [\"id\"])\n",
"step4 = step3.selectExpr(\"sum(id)\")\n",
"\n",
"step4.collect()  # expected: [Row(sum(id)=2500000000000)]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <p><b>SparkSession - hive</b></p>\n",
" \n",
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://localhost:4040\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v2.4.3</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
" <dd><code>PySparkShell</code></dd>\n",
" </dl>\n",
" </div>\n",
" \n",
" </div>\n",
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x117bbef60>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 7688246

Please sign in to comment.