diff --git a/analyses/2019_03_Aimaanhasan__issue#36_array_extension.ipynb b/analyses/2019_03_Aimaanhasan__issue#36_array_extension.ipynb index 54adc8a..da05558 100644 --- a/analyses/2019_03_Aimaanhasan__issue#36_array_extension.ipynb +++ b/analyses/2019_03_Aimaanhasan__issue#36_array_extension.ipynb @@ -8,6 +8,84 @@ "This notebook have used the existing examples of string processing which @birdsarah has produced, present in 'analyses/issue_36.ipynb' to analyse it using **array extensions**" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fletcher Array Extensions\n", + "Source: [fletcher docs](https://fletcher.readthedocs.io/en/latest/)\n", + "\n", + "Fletcher provides a generic implementation of the ExtensionDtype and ExtensionArray interfaces of Pandas for columns backed by Apache Arrow. By using it you can use any data type available in Apache Arrow natively in Pandas. Most prominently, fletcher provides native String und List types.\n", + "\n", + "In addition to bringing an alternative memory backend to NumPy, fletcher also provides high-performance operations on the new column types. It will either use the native implementation of an algorithm if provided in pyarrow or otherwise provide an implementation by itself using Numba.\n", + "\n", + "An example of usage is shown below\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 3 entries, 0 to 2\n", + "Data columns (total 1 columns):\n", + "str_column 2 non-null fletcher[string]\n", + "dtypes: fletcher[string](1)\n", + "memory usage: 108.0 bytes\n" + ] + } + ], + "source": [ + "import fletcher as fr\n", + "import pandas as pd\n", + "\n", + "df = pd.DataFrame({\n", + " 'str_column': fr.FletcherArray(['Test', None, 'Strings'])\n", + "})\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis #1\n", + "### Comparing time to calculate count of unique values and nunique method usage \n", + "Calculating count of unique values not using fletcher - Wall time = 567ms. See Analysis-1.1 for code and processing
\n", + "Calculating count of unique values using fletcher - Wall time = 0ns. See Analysis-1.2 for code and processing
\n", + "The nunique method doesn't work using Fletcher Array thus \n", + "\n", + "\n", + "\n", + "### Findings\n", + "Array extensions has decreased the Wall time and has increased efficiency" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis #2\n", + "### .str in DASK with Fletcher Array Extension is not supported\n", + "It gives the above error because DASK doesn't allow access to str with Fletcher Array Extension. See Analysis-1.1 for code and processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "1. Array extension makes string processing faster for some cases\n", + "1. Needs extra methods and operations for DASK methods to work correctly\n", + "1. Some methods are not supported for Fletcher Array type objects" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -21,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 1, "metadata": { "scrolled": true }, @@ -34,8 +112,8 @@ "\n", "

Client

\n", "\n", "\n", "\n", @@ -43,17 +121,17 @@ "\n", "\n", "\n", "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 26, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -67,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -166,19 +244,26 @@ "4 https://ajax.googleapis.com/ajax/libs/webfont/... " ] }, - "execution_count": 27, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = dd.read_parquet(\n", - " \"C:\\\\Users\\\\Ayman Hasan\\\\Desktop\\\\Outreachy\\\\sample\\\\part-00000-34d9b361-ea79-42eb-82ee-9c9f9259c339-c000.snappy.parquet\", engine='pyarrow',\n", + " \"C:\\\\Users\\\\hamza\\\\Downloads\\\\safe_dataset.sample (1)\\\\sample\\\\part-00000-34d9b361-ea79-42eb-82ee-9c9f9259c339-c000.snappy.parquet\", engine='pyarrow',\n", " columns=['argument_0', 'func_name', 'symbol', 'location', 'script_url']\n", ")\n", "df.head()\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -193,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -245,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -382,7 +467,7 @@ "4 ajax.googleapis.com||webfont.js||ra/< " ] }, - "execution_count": 29, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -397,9 +482,188 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "dfpd = df.compute() #Converting to Pandas Dataframe\n", + "\n", + "\n", + "for i in dfpd.columns:\n", + " dfpd[i] = fr.FletcherArray(dfpd[i])\n", + "dfpd = dd.from_pandas(dfpd, npartitions=2)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis - 1.1\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 418 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "n_unique_aggs = df.agg.nunique().compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1261" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_unique_aggs" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 0 ns\n" + ] + } + ], + "source": [ + "#Using different approach\n", + "%time\n", + "n_unique_aggs_from_fr = len(dfpd.agg.unique().compute())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1261" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_unique_aggs_from_fr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis - 1.2\n", + "\n", + "See Analysis-1.1 for different approach
" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "data type not understood", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mn_unique_aggs_from_fr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdfpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0magg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnunique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;31m#The above command returns an error\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mnunique\u001b[1;34m(self, split_every)\u001b[0m\n\u001b[0;32m 2068\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mderived_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2069\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mnunique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_every\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2070\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msplit_every\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msplit_every\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2071\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2072\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mderived_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mdrop_duplicates\u001b[1;34m(self, split_every, split_out, **kwargs)\u001b[0m\n\u001b[0;32m 418\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_every\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_out\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 419\u001b[0m \u001b[1;31m# Let pandas error on bad inputs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 420\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_meta_nonempty\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 421\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;34m'subset'\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'subset'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 422\u001b[0m \u001b[0msplit_out_setup\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msplit_out_on_cols\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36m_meta_nonempty\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 287\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_meta_nonempty\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 288\u001b[0m \u001b[1;34m\"\"\" A non-empty version of `_meta` with fake data.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 289\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmeta_nonempty\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_meta\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 290\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 291\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\utils.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[0;32m 411\u001b[0m \"\"\"\n\u001b[0;32m 412\u001b[0m \u001b[0mmeth\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 413\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmeth\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 414\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 415\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\utils.py\u001b[0m in \u001b[0;36m_nonempty_series\u001b[1;34m(s, idx)\u001b[0m\n\u001b[0;32m 445\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 446\u001b[0m \u001b[0mentry\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_scalar_from_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 447\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mentry\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mentry\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 448\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 449\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0midx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mTypeError\u001b[0m: data type not understood" + ] + } + ], + "source": [ + "n_unique_aggs_from_fr = dfpd.agg.nunique()\n", + "#The above command returns an error\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Looking for strings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = dd.read_parquet(\n", + " 'C:\\\\Users\\\\Ayman Hasan\\\\Desktop\\\\Outreachy\\\\sample\\\\part-00000-34d9b361-ea79-42eb-82ee-9c9f9259c339-c000.snappy.parquet', engine='pyarrow',\n", + " columns=['argument_0', 'script_url']\n", + ")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\distributed\\worker.py:2788: UserWarning: Large object of size 3.66 MB detected in task graph: \n", + " ( argument_0 ... 9 columns], 5)\n", + "Consider scattering large objects ahead of time\n", + "with client.scatter to reduce scheduler burden and \n", + "keep data on workers\n", + "\n", + " future = client.submit(func, big_data) # bad\n", + "\n", + " big_future = client.scatter(big_data) # good\n", + " future = client.submit(func, big_future) # good\n", + " % (format_bytes(len(b)), s))\n" + ] + }, { "data": { "text/html": [ @@ -534,7 +798,7 @@ "4 ajax.googleapis.com||webfont.js||ra/< " ] }, - "execution_count": 30, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -551,474 +815,86 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Wall time: 567 ms\n" + "5\n", + "Wall time: 439 ms\n" ] } ], "source": [ "%%time\n", - "n_unique_aggs = df.agg.nunique().compute()" + "print(df[df.argument_0.str.contains('modernizr')].script_url.nunique().compute())\n", + "#print(df[df.argument_0.str.contains('modernizr')].head())" ] }, { - "cell_type": "code", - "execution_count": 32, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1261" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "n_unique_aggs" + "## Analysis - 2\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 13, "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "data type not understood", + "ename": "AttributeError", + "evalue": "Can only use .str accessor with object dtype", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mn_unique_aggs_from_fr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdfpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0magg\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnunique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;31m#The above command returns an error\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mnunique\u001b[1;34m(self, split_every)\u001b[0m\n\u001b[0;32m 2068\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mderived_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2069\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mnunique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_every\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2070\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msplit_every\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msplit_every\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2071\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2072\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mderived_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mdrop_duplicates\u001b[1;34m(self, split_every, split_out, **kwargs)\u001b[0m\n\u001b[0;32m 418\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_every\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msplit_out\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 419\u001b[0m \u001b[1;31m# Let pandas error on bad inputs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 420\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_meta_nonempty\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdrop_duplicates\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 421\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;34m'subset'\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'subset'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 422\u001b[0m \u001b[0msplit_out_setup\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msplit_out_on_cols\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36m_meta_nonempty\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 287\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_meta_nonempty\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 288\u001b[0m \u001b[1;34m\"\"\" A non-empty version of `_meta` with fake data.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 289\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmeta_nonempty\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_meta\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 290\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 291\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\utils.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[0;32m 411\u001b[0m \"\"\"\n\u001b[0;32m 412\u001b[0m \u001b[0mmeth\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 413\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmeth\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 414\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 415\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\utils.py\u001b[0m in \u001b[0;36m_nonempty_series\u001b[1;34m(s, idx)\u001b[0m\n\u001b[0;32m 445\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 446\u001b[0m \u001b[0mentry\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_scalar_from_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 447\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mentry\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mentry\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 448\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 449\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0midx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mTypeError\u001b[0m: data type not understood" + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n", + "\u001b[1;32mpandas\\_libs\\properties.pyx\u001b[0m in \u001b[0;36mpandas._libs.properties.CachedProperty.__get__\u001b[1;34m()\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mstr\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1892\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1893\u001b[0m \u001b[1;34m\"\"\" Namespace for string methods \"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1894\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mStringAccessor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1895\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1896\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__dir__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\accessor.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, series)\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Accessor cannot be initialized'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 37\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_series\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mseries\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\accessor.py\u001b[0m in \u001b[0;36m_validate\u001b[1;34m(self, series)\u001b[0m\n\u001b[0;32m 114\u001b[0m \u001b[0mis_categorical_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 115\u001b[0m series.cat.categories.dtype == 'object')):\n\u001b[1;32m--> 116\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Can only use .str accessor with object dtype\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 117\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 118\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mderived_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrings\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mStringMethods\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mAttributeError\u001b[0m: Can only use .str accessor with object dtype" ] } ], "source": [ - "n_unique_aggs_from_fr = dfpd.agg.nunique()\n", - "#The above command returns an error\n" + "%%time\n", + "print(len(dfpd[dfpd.argument_0.str.contains('modernizr')].script_url.unique().compute())) #contains is not supported with fletcher array extension" ] }, { - "cell_type": "code", - "execution_count": 34, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 0 ns\n" - ] - } - ], - "source": [ - "#Using different approach\n", - "%time\n", - "n_unique_aggs_from_fr = len(dfpd.agg.unique().compute())\n" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1261" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "n_unique_aggs_from_fr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analysis #1\n", - "### Comparing time to calculate count of unique values \n", - "Calculating count of unique values not using fletcher - Wall time = 567ms
\n", - "Calculating count of unique values using fletcher - Wall time = 0ns\n", - "\n", - "### Findings\n", - "Array extensions has decreased the Wall time and has increased efficiency" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Looking for strings" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
argument_0script_url
0https://staticxx.facebook.com/connect/xd_arbit...
1https://staticxx.facebook.com/connect/xd_arbit...
2https://staticxx.facebook.com/connect/xd_arbit...
3https://staticxx.facebook.com/connect/xd_arbit...
4https://ajax.googleapis.com/ajax/libs/webfont/...
\n", - "
" - ], - "text/plain": [ - " argument_0 script_url\n", - "0 https://staticxx.facebook.com/connect/xd_arbit...\n", - "1 https://staticxx.facebook.com/connect/xd_arbit...\n", - "2 https://staticxx.facebook.com/connect/xd_arbit...\n", - "3 https://staticxx.facebook.com/connect/xd_arbit...\n", - "4 https://ajax.googleapis.com/ajax/libs/webfont/..." - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = dd.read_parquet(\n", - " 'C:\\\\Users\\\\Ayman Hasan\\\\Desktop\\\\Outreachy\\\\sample\\\\part-00000-34d9b361-ea79-42eb-82ee-9c9f9259c339-c000.snappy.parquet', engine='pyarrow',\n", - " columns=['argument_0', 'script_url']\n", - ")\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
argument_0script_url
0https://staticxx.facebook.com/connect/xd_arbit...
1https://staticxx.facebook.com/connect/xd_arbit...
2https://staticxx.facebook.com/connect/xd_arbit...
3https://staticxx.facebook.com/connect/xd_arbit...
4https://ajax.googleapis.com/ajax/libs/webfont/...
\n", - "
" - ], - "text/plain": [ - " argument_0 script_url\n", - "0 https://staticxx.facebook.com/connect/xd_arbit...\n", - "1 https://staticxx.facebook.com/connect/xd_arbit...\n", - "2 https://staticxx.facebook.com/connect/xd_arbit...\n", - "3 https://staticxx.facebook.com/connect/xd_arbit...\n", - "4 https://ajax.googleapis.com/ajax/libs/webfont/..." - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dfpd = df.compute() #Converting to Pandas Dataframe\n", - "\n", - "\n", - "for i in dfpd.columns:\n", - " dfpd[i] = fr.FletcherArray(dfpd[i])\n", - "dfpd = dd.from_pandas(dfpd, npartitions=2)\n", - "dfpd.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5\n", - "Wall time: 603 ms\n", - "Compiler : 258 ms\n" - ] - } - ], "source": [ - "%%time\n", - "print(df[df.argument_0.str.contains('modernizr')].script_url.nunique().compute())\n", - "#print(df[df.argument_0.str.contains('modernizr')].head())" + "#### Splitting symbol" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 14, "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "Can only use .str accessor with object dtype", + "ename": "OSError", + "evalue": "Passed non-file path: C:\\Users\\Ayman Hasan\\Desktop\\Outreachy\\sample\\part-00000-34d9b361-ea79-42eb-82ee-9c9f9259c339-c000.snappy.parquet", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n", - "\u001b[1;32mpandas\\_libs\\properties.pyx\u001b[0m in \u001b[0;36mpandas._libs.properties.CachedProperty.__get__\u001b[1;34m()\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mstr\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1892\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1893\u001b[0m \u001b[1;34m\"\"\" Namespace for string methods \"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1894\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mStringAccessor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1895\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1896\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__dir__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\accessor.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, series)\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Accessor cannot be initialized'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 37\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_series\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mseries\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\accessor.py\u001b[0m in \u001b[0;36m_validate\u001b[1;34m(self, series)\u001b[0m\n\u001b[0;32m 114\u001b[0m \u001b[0mis_categorical_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 115\u001b[0m series.cat.categories.dtype == 'object')):\n\u001b[1;32m--> 116\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Can only use .str accessor with object dtype\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 117\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 118\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mderived_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrings\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mStringMethods\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mAttributeError\u001b[0m: Can only use .str accessor with object dtype" + "\u001b[1;31mOSError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m df = dd.read_parquet(\n\u001b[0;32m 2\u001b[0m \u001b[1;34m'C:\\\\Users\\\\Ayman Hasan\\\\Desktop\\\\Outreachy\\\\sample\\\\part-00000-34d9b361-ea79-42eb-82ee-9c9f9259c339-c000.snappy.parquet'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'pyarrow'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mcolumns\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'argument_0'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'func_name'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'symbol'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'location'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'script_url'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m )\n\u001b[0;32m 5\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\io\\parquet.py\u001b[0m in \u001b[0;36mread_parquet\u001b[1;34m(path, columns, filters, categories, index, storage_options, engine, infer_divisions)\u001b[0m\n\u001b[0;32m 1152\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1153\u001b[0m return read(fs, fs_token, paths, columns=columns, filters=filters,\n\u001b[1;32m-> 1154\u001b[1;33m categories=categories, index=index, infer_divisions=infer_divisions)\n\u001b[0m\u001b[0;32m 1155\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1156\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\io\\parquet.py\u001b[0m in \u001b[0;36m_read_pyarrow\u001b[1;34m(fs, fs_token, paths, columns, filters, categories, index, infer_divisions)\u001b[0m\n\u001b[0;32m 673\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 674\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 675\u001b[1;33m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpq\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mParquetDataset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpaths\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfilesystem\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mget_pyarrow_filesystem\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 676\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpartitions\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 677\u001b[0m partitions = [n for n in dataset.partitions.partition_names\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyarrow\\parquet.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, path_or_paths, filesystem, schema, metadata, split_row_groups, validate_schema, filters, metadata_nthreads)\u001b[0m\n\u001b[0;32m 888\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommon_metadata_path\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 889\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmetadata_path\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_make_manifest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 890\u001b[1;33m path_or_paths, self.fs, metadata_nthreads=metadata_nthreads)\n\u001b[0m\u001b[0;32m 891\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 892\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcommon_metadata_path\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyarrow\\parquet.py\u001b[0m in \u001b[0;36m_make_manifest\u001b[1;34m(path_or_paths, fs, pathsep, metadata_nthreads)\u001b[0m\n\u001b[0;32m 1063\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0misfile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1064\u001b[0m raise IOError('Passed non-file path: {0}'\n\u001b[1;32m-> 1065\u001b[1;33m .format(path))\n\u001b[0m\u001b[0;32m 1066\u001b[0m \u001b[0mpiece\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mParquetDatasetPiece\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1067\u001b[0m \u001b[0mpieces\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpiece\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mOSError\u001b[0m: Passed non-file path: C:\\Users\\Ayman Hasan\\Desktop\\Outreachy\\sample\\part-00000-34d9b361-ea79-42eb-82ee-9c9f9259c339-c000.snappy.parquet" ] } ], - "source": [ - "%%time\n", - "print(len(dfpd[dfpd.argument_0.str.contains('modernizr')].script_url.unique().compute())) #contains is not supported with fletcher array extension" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analysis #2\n", - "### .str in DASK with Fletcher Array Extension is not supported\n", - "It gives the above error because DASK doesn't allow access to str with Fletcher Array Extension" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Conclusion\n", - "\n", - "1. Array extension makes string processing faster for some cases\n", - "1. Needs extra methods and operations for DASK methods to work correctly\n", - "1. Some methods are not supported for Fletcher Array type objects" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Splitting symbol" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
argument_0func_namesymbollocationscript_url
0a/<window.namehttps://staticxx.facebook.com/connect/xd_arbit...https://staticxx.facebook.com/connect/xd_arbit...
1a/<window.namehttps://staticxx.facebook.com/connect/xd_arbit...https://staticxx.facebook.com/connect/xd_arbit...
2Awindow.document.cookiehttps://staticxx.facebook.com/connect/xd_arbit...https://staticxx.facebook.com/connect/xd_arbit...
3xwindow.navigator.userAgenthttps://staticxx.facebook.com/connect/xd_arbit...https://staticxx.facebook.com/connect/xd_arbit...
4ra/<window.navigator.userAgenthttps://cas.us.criteo.com/delivery/r/afr.php?d...https://ajax.googleapis.com/ajax/libs/webfont/...
\n", - "
" - ], - "text/plain": [ - " argument_0 func_name symbol \\\n", - "0 a/< window.name \n", - "1 a/< window.name \n", - "2 A window.document.cookie \n", - "3 x window.navigator.userAgent \n", - "4 ra/< window.navigator.userAgent \n", - "\n", - " location \\\n", - "0 https://staticxx.facebook.com/connect/xd_arbit... \n", - "1 https://staticxx.facebook.com/connect/xd_arbit... \n", - "2 https://staticxx.facebook.com/connect/xd_arbit... \n", - "3 https://staticxx.facebook.com/connect/xd_arbit... \n", - "4 https://cas.us.criteo.com/delivery/r/afr.php?d... \n", - "\n", - " script_url \n", - "0 https://staticxx.facebook.com/connect/xd_arbit... \n", - "1 https://staticxx.facebook.com/connect/xd_arbit... \n", - "2 https://staticxx.facebook.com/connect/xd_arbit... \n", - "3 https://staticxx.facebook.com/connect/xd_arbit... \n", - "4 https://ajax.googleapis.com/ajax/libs/webfont/... " - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ "df = dd.read_parquet(\n", " 'C:\\\\Users\\\\Ayman Hasan\\\\Desktop\\\\Outreachy\\\\sample\\\\part-00000-34d9b361-ea79-42eb-82ee-9c9f9259c339-c000.snappy.parquet', engine='pyarrow',\n", @@ -1029,110 +905,9 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
argument_0func_namesymbollocationscript_url
0a/<window.namehttps://staticxx.facebook.com/connect/xd_arbit...https://staticxx.facebook.com/connect/xd_arbit...
1a/<window.namehttps://staticxx.facebook.com/connect/xd_arbit...https://staticxx.facebook.com/connect/xd_arbit...
2Awindow.document.cookiehttps://staticxx.facebook.com/connect/xd_arbit...https://staticxx.facebook.com/connect/xd_arbit...
3xwindow.navigator.userAgenthttps://staticxx.facebook.com/connect/xd_arbit...https://staticxx.facebook.com/connect/xd_arbit...
4ra/<window.navigator.userAgenthttps://cas.us.criteo.com/delivery/r/afr.php?d...https://ajax.googleapis.com/ajax/libs/webfont/...
\n", - "
" - ], - "text/plain": [ - " argument_0 func_name symbol \\\n", - "0 a/< window.name \n", - "1 a/< window.name \n", - "2 A window.document.cookie \n", - "3 x window.navigator.userAgent \n", - "4 ra/< window.navigator.userAgent \n", - "\n", - " location \\\n", - "0 https://staticxx.facebook.com/connect/xd_arbit... \n", - "1 https://staticxx.facebook.com/connect/xd_arbit... \n", - "2 https://staticxx.facebook.com/connect/xd_arbit... \n", - "3 https://staticxx.facebook.com/connect/xd_arbit... \n", - "4 https://cas.us.criteo.com/delivery/r/afr.php?d... \n", - "\n", - " script_url \n", - "0 https://staticxx.facebook.com/connect/xd_arbit... \n", - "1 https://staticxx.facebook.com/connect/xd_arbit... \n", - "2 https://staticxx.facebook.com/connect/xd_arbit... \n", - "3 https://staticxx.facebook.com/connect/xd_arbit... \n", - "4 https://ajax.googleapis.com/ajax/libs/webfont/... " - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dfpd = df.compute() #Converting to Pandas Dataframe\n", "\n", @@ -1145,22 +920,9 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'DataFrame' object has no attribute 'symbol_parts'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'symbol_2'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol_parts\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mdfpd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'symbol_0'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdfpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol_parts\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36m__getattr__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 2531\u001b[0m return new_dd_object(merge(self.dask, dsk), name,\n\u001b[0;32m 2532\u001b[0m meta, self.divisions)\n\u001b[1;32m-> 2533\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"'DataFrame' object has no attribute %r\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2534\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2535\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__dir__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mAttributeError\u001b[0m: 'DataFrame' object has no attribute 'symbol_parts'" - ] - } - ], + "outputs": [], "source": [ "df['symbol_parts'] = df.symbol.str.split('.')\n", "df['symbol_0'] = df.symbol_parts.str.get(0)\n", @@ -1172,25 +934,9 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "Can only use .str accessor with object dtype", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdfpd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'symbol_parts'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdfpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'.'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mdfpd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'symbol_0'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdfpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol_parts\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0mdfpd\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'symbol_1'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msymbol_parts\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mpandas\\_libs\\properties.pyx\u001b[0m in \u001b[0;36mpandas._libs.properties.CachedProperty.__get__\u001b[1;34m()\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\core.py\u001b[0m in \u001b[0;36mstr\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1892\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1893\u001b[0m \u001b[1;34m\"\"\" Namespace for string methods \"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1894\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mStringAccessor\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1895\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1896\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__dir__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\accessor.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, series)\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSeries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Accessor cannot be initialized'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 37\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_series\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mseries\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\dask\\dataframe\\accessor.py\u001b[0m in \u001b[0;36m_validate\u001b[1;34m(self, series)\u001b[0m\n\u001b[0;32m 114\u001b[0m \u001b[0mis_categorical_dtype\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 115\u001b[0m series.cat.categories.dtype == 'object')):\n\u001b[1;32m--> 116\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mAttributeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Can only use .str accessor with object dtype\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 117\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 118\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mderived_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrings\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mStringMethods\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mAttributeError\u001b[0m: Can only use .str accessor with object dtype" - ] - } - ], + "outputs": [], "source": [ "dfpd['symbol_parts'] = dfpd.symbol.str.split('.')\n", "dfpd['symbol_0'] = dfpd.symbol_parts.str.get(0)\n", @@ -1201,18 +947,9 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5\n", - "Wall time: 367 ms\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "print(df[df.symbol_1 == 'fillText'].func_name.nunique().compute())" @@ -1222,6 +959,13 @@ "cell_type": "markdown", "metadata": {}, "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1229,6 +973,18 @@ "display_name": "Python 3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" } }, "nbformat": 4,