diff --git a/spark_/NYC_Taxi_spark_exp1.ipynb b/spark_/NYC_Taxi_spark_exp1.ipynb index 51806fe..85b590f 100644 --- a/spark_/NYC_Taxi_spark_exp1.ipynb +++ b/spark_/NYC_Taxi_spark_exp1.ipynb @@ -42,7 +42,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -81,14 +81,15 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "/Users/yennanliu/NYC_Taxi_Trip_Duration/spark_\n" + "[Errno 2] No such file or directory: 'NYC_Taxi_Trip_Duration/spark_/'\n", + "/Users/yennanliu/notebook\n" ] } ], @@ -105,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -117,7 +118,27 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyspark.sql.dataframe.DataFrame" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -136,7 +157,7 @@ " ('trip_duration', 'int')]" ] }, - "execution_count": 30, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -148,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -157,7 +178,7 @@ "0" ] }, - "execution_count": 32, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -169,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "scrolled": true }, @@ -213,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -224,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -236,7 +257,7 @@ " Row(id=u'id3504673', vendor_id=2, pickup_datetime=datetime.datetime(2016, 4, 6, 19, 32, 31), dropoff_datetime=datetime.datetime(2016, 4, 6, 19, 39, 40), passenger_count=1, pickup_longitude=-74.01004028320312, pickup_latitude=40.719970703125, dropoff_longitude=-74.01226806640625, dropoff_latitude=40.70671844482422, store_and_fwd_flag=u'N', trip_duration=429)]" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -247,7 +268,27 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -257,16 +298,16 @@ "+---------+--------+\n", "| id|count(1)|\n", "+---------+--------+\n", - "|id3013319| 1|\n", - "|id1622754| 1|\n", - "|id2187774| 1|\n", - "|id3921267| 1|\n", - "|id2795297| 1|\n", - "|id0130048| 1|\n", - "|id2088360| 1|\n", - "|id0454719| 1|\n", - "|id2366364| 1|\n", - "|id0187208| 1|\n", + "|id0515898| 1|\n", + "|id2677357| 1|\n", + "|id0556588| 1|\n", + "|id1381256| 1|\n", + "|id3524926| 1|\n", + "|id0082224| 1|\n", + "|id1631034| 1|\n", + "|id3829159| 1|\n", + "|id3048673| 1|\n", + "|id0045059| 1|\n", "+---------+--------+\n", "\n" ] @@ -282,6 +323,28 @@ " limit 10\"\"\").show()\n" ] }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+\n", + "|count(1)|\n", + "+--------+\n", + "| 1458644|\n", + "+--------+\n", + "\n" + ] + } + ], + "source": [ + "spark.sql(\"SELECT count(*) from df_train_table\" ).show()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -291,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -301,16 +364,16 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 41, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -325,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -334,7 +397,7 @@ "Column" ] }, - "execution_count": 43, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -346,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -357,7 +420,7 @@ " Row(id=u'id0000250', vendor_id=1, pickup_datetime=datetime.datetime(2016, 3, 30, 8, 38, 35), dropoff_datetime=datetime.datetime(2016, 3, 30, 8, 46, 42), passenger_count=1, pickup_longitude=-73.99744415283203, pickup_latitude=40.7363395690918, dropoff_longitude=-73.98930358886719, dropoff_latitude=40.74203109741211, store_and_fwd_flag=u'N', trip_duration=487, moving_avg=1.0)]" ] }, - "execution_count": 45, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -376,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -385,7 +448,7 @@ "DataFrame[pickup_datetime: timestamp, passenger_count: double]" ] }, - "execution_count": 53, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -401,7 +464,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -411,7 +474,7 @@ " Row(pickup_datetime=datetime.datetime(2016, 4, 20, 11, 38, 30), passenger_count=None)]" ] }, - "execution_count": 54, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -436,61 +499,70 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "df__ = sc.textFile(\"/Users/yennanliu/NYC_Taxi_Trip_Duration/data/train.csv\")\n", + "#df__.filter(lambda x: '2124' in x.).collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "u'/Users/yennanliu/NYC_Taxi_Trip_Duration/spark_'" + "[u'id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration',\n", + " u'id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982154846191406,40.767936706542969,-73.964630126953125,40.765602111816406,N,455',\n", + " u'id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415344238281,40.738563537597656,-73.999481201171875,40.731151580810547,N,663',\n", + " u'id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979026794433594,40.763938903808594,-74.005332946777344,40.710086822509766,N,2124',\n", + " u'id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.010040283203125,40.719970703125,-74.01226806640625,40.706718444824219,N,429']" ] }, - "execution_count": 14, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pwd" + "df__.take(5)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "df__ = sc.textFile(\"/Users/yennanliu/NYC_Taxi_Trip_Duration/data/train.csv\")\n", - "#df__.filter(lambda x: '2124' in x.).collect()" + "#type(df__)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[u'id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration',\n", - " u'id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982154846191406,40.767936706542969,-73.964630126953125,40.765602111816406,N,455',\n", - " u'id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415344238281,40.738563537597656,-73.999481201171875,40.731151580810547,N,663',\n", - " u'id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979026794433594,40.763938903808594,-74.005332946777344,40.710086822509766,N,2124',\n", - " u'id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.010040283203125,40.719970703125,-74.01226806640625,40.706718444824219,N,429']" + "pyspark.sql.dataframe.DataFrame" ] }, - "execution_count": 16, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df__.take(5)" + "type(df_train)" ] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -509,7 +581,7 @@ " ('trip_duration', 'int')]" ] }, - "execution_count": 87, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -522,49 +594,171 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "pyspark.sql.dataframe.DataFrame" + "pyspark.rdd.RDD" ] }, - "execution_count": 88, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "type(df_train)" + "xx = df_train.select('id','vendor_id','pickup_datetime').rdd\n", + "#xx = df_train.select('vendor_id').rdd\n", + "type(xx)" ] }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "pyspark.rdd.RDD" + "['2016-03-14', '2016-06-12', '2016-01-19', '2016-04-06']" ] }, - "execution_count": 146, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "xx = df_train.select('id','vendor_id','pickup_datetime').rdd\n", - "#xx = df_train.select('vendor_id').rdd\n", - "type(xx)" + "# date \n", + "xx.map(lambda x: str(x[2]).split(\" \")[0] ).take(4)" ] }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(u'id2875421', 2), (u'id2377394', 1), (u'id3858529', 2), (u'id3504673', 2)]" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# id and vendor id \n", + "xx.map(lambda x: (x[0], x[1]) ).take(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# filter \n", + "\n", + "#xx.filter(lambda x : x[0] == 'id2875421').take(10)\n", + "xx.filter(lambda x : x[0] == 'id2875421').count()" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(u'id2875421', 2, datetime.datetime(2016, 3, 14, 17, 24, 55)),\n", + " (u'id2377394', 1, datetime.datetime(2016, 6, 12, 0, 43, 35)),\n", + " (u'id3858529', 2, datetime.datetime(2016, 1, 19, 11, 35, 24))]" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xx.map(lambda x: (x[0:])).take(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[u'id2875421',\n", + " 2,\n", + " datetime.datetime(2016, 3, 14, 17, 24, 55),\n", + " u'id2377394',\n", + " 1,\n", + " datetime.datetime(2016, 6, 12, 0, 43, 35),\n", + " u'id3858529',\n", + " 2,\n", + " datetime.datetime(2016, 1, 19, 11, 35, 24),\n", + " u'id3504673']" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# flatMap\n", + "\n", + "xx.flatMap(lambda x: (x)).take(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[datetime.datetime(2016, 3, 14, 17, 24, 55),\n", + " datetime.datetime(2016, 6, 12, 0, 43, 35),\n", + " datetime.datetime(2016, 1, 19, 11, 35, 24),\n", + " datetime.datetime(2016, 4, 6, 19, 32, 31)]" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#xx\n", + "xx.map(lambda x: (x[2]) ).take(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -573,7 +767,7 @@ "[True, False, False]" ] }, - "execution_count": 126, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -584,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -596,7 +790,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -605,7 +799,7 @@ "[u'id2875421', u'id2377394', u'id3858529']" ] }, - "execution_count": 157, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -617,7 +811,201 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(u'id2875421', 2, datetime.datetime(2016, 3, 14, 17, 24, 55)),\n", + " (u'id2377394', 1, datetime.datetime(2016, 6, 12, 0, 43, 35)),\n", + " (u'id3858529', 2, datetime.datetime(2016, 1, 19, 11, 35, 24))]" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xx.map(lambda x: (x[0], x[1],x[2]) ).take(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "#data_key_2 = xx.map(lambda x: (x[0]) )\n", + "#data_key_2.reduceByKey(lambda x: x + x).take(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(u'id0349415', 2), (u'id2017679', 1), (u'id1697645', 2), (u'id0429368', 2)]" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# reduceByKey\n", + "\n", + "#xx.reduceByKey(lambda x, y: x + y).collect()\n", + "data_key_ = xx.map(lambda x: (x[0], x[1]) )\n", + "data_key_.reduceByKey(lambda x, y: x + y).take(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[id: string, vendor_id: bigint, pickup_datetime: timestamp]" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.createDataFrame(xx)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[id: string, vendor_id: bigint, pickup_datetime: timestamp]" + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rdd -> spark dataframe \n", + "\n", + "df_xx = sqlContext.createDataFrame(xx)\n", + "df_xx" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyspark.sql.dataframe.DataFrame" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df_xx)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Column" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_xx['id'].show" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+--------+\n", + "| id|count(1)|\n", + "+---------+--------+\n", + "|id3013319| 1|\n", + "|id1622754| 1|\n", + "|id2187774| 1|\n", + "|id3921267| 1|\n", + "|id2795297| 1|\n", + "|id0130048| 1|\n", + "|id2088360| 1|\n", + "|id0454719| 1|\n", + "|id2366364| 1|\n", + "|id0187208| 1|\n", + "+---------+--------+\n", + "\n" + ] + } + ], + "source": [ + "# spark dataframe to spark sql \n", + "\n", + "df_xx.registerTempTable(\"df_xx_table\")\n", + "sqlContext.sql(\"\"\"\n", + " SELECT id, count(*) \n", + " FROM df_xx_table\n", + " group by 1 \n", + " order by 2 desc \n", + " limit 10\"\"\").show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -630,7 +1018,7 @@ " Row(id=u'id2181028', vendor_id=2, pickup_datetime=datetime.datetime(2016, 3, 26, 13, 30, 55))]" ] }, - "execution_count": 159, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -641,7 +1029,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -650,7 +1038,7 @@ "[2, 1, 2]" ] }, - "execution_count": 130, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -661,7 +1049,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -670,7 +1058,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -688,7 +1076,7 @@ " u'id1301050,1,2016-05-27']" ] }, - "execution_count": 66, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -699,19 +1087,19 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# df__.map(lambda x: x[12:22]).take(10)\n", "\n", - "df__value = df__.filter(lambda line: line != header)\n", - "result = df__value.groupBy(lambda x : x[0:][12:22]).take(10)" + "#df__value = df__.filter(lambda line: line != header)\n", + "#result = df__value.groupBy(lambda x : x[0:][12:22]).take(10)" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -728,6 +1116,157 @@ "df__value.map(lambda x : x[0:][12:22]).take(10)\n", "print (df__value.map(lambda x : x[0:][12:22]).take(10))\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "###############" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('a', 2), ('b', 1)]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n", + "sorted(rdd.countByKey().items())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('a', 2), ('b', 1)]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from operator import add\n", + "rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n", + "sorted(rdd.reduceByKey(add).collect())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('a', 3),\n", + " ('fleece', 7),\n", + " ('had', 2),\n", + " ('lamb', 5),\n", + " ('little', 4),\n", + " ('Mary', 1),\n", + " ('was', 8),\n", + " ('white', 9),\n", + " ('whose', 6)]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n", + "sc.parallelize(tmp).sortByKey().first()\n", + "sc.parallelize(tmp).sortByKey(True, 1).collect()\n", + "sc.parallelize(tmp).sortByKey(True, 2).collect()\n", + "tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]\n", + "tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])\n", + "sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = sc.parallelize(range(0,5))\n", + "y = sc.parallelize(range(1000, 1005))\n", + "x.zip(y).collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0, [2, 8]), (1, [1, 1, 3, 5])]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rdd = sc.parallelize([1, 1, 2, 3, 5, 8])\n", + "result = rdd.groupBy(lambda x: x % 2).collect()\n", + "sorted([(x, sorted(y)) for (x, y) in result])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {