From 912794e9f2b8c340507eb795aa6a9a96387c6481 Mon Sep 17 00:00:00 2001 From: Carl GENNETAIS Date: Fri, 22 Mar 2024 15:55:30 +0100 Subject: [PATCH] clean code --- Makefile | 7 + dstools-requirements.txt | 2 + notebooks/PySpark_linreg_P4.ipynb | 469 +++++++++++++++--------------- setup.py | 10 +- src/data/make_dataset.py | 17 +- test_environment.py | 9 +- 6 files changed, 270 insertions(+), 244 deletions(-) diff --git a/Makefile b/Makefile index 89f2803..6215aea 100644 --- a/Makefile +++ b/Makefile @@ -40,6 +40,13 @@ clean: find . -type f -name "*.py[co]" -delete find . -type d -name "__pycache__" -delete +## Clean notebooks and python files with black and isort +clean_code: + black . + isort . + nbqa autoflake --remove-all-unused-imports --remove-unused-variables -i notebooks/*.ipynb + nbqa isort notebooks/*.ipynb + ## Lint using flake8 lint: flake8 src diff --git a/dstools-requirements.txt b/dstools-requirements.txt index 1312c50..7ac2e96 100644 --- a/dstools-requirements.txt +++ b/dstools-requirements.txt @@ -4,3 +4,5 @@ jupyterlab-widgets jupyterlab_code_formatter jupyterlab_git lckr_jupyterlab_variableinspector +nbqa +autoflake diff --git a/notebooks/PySpark_linreg_P4.ipynb b/notebooks/PySpark_linreg_P4.ipynb index 20b24af..4475539 100644 --- a/notebooks/PySpark_linreg_P4.ipynb +++ b/notebooks/PySpark_linreg_P4.ipynb @@ -6,11 +6,11 @@ "id": "6eed6693-9c89-4581-9526-b3c7cd79a69c", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:42:53.406201Z", - "iopub.status.busy": "2024-03-21T19:42:53.406107Z", - "iopub.status.idle": "2024-03-21T19:42:55.223195Z", - "shell.execute_reply": "2024-03-21T19:42:55.222840Z", - "shell.execute_reply.started": "2024-03-21T19:42:53.406190Z" + "iopub.execute_input": "2024-03-22T14:54:38.589245Z", + "iopub.status.busy": "2024-03-22T14:54:38.589001Z", + "iopub.status.idle": "2024-03-22T14:54:45.763866Z", + "shell.execute_reply": "2024-03-22T14:54:45.763245Z", + "shell.execute_reply.started": "2024-03-22T14:54:38.589224Z" 
} }, "outputs": [ @@ -18,11 +18,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "24/03/21 20:42:54 WARN Utils: Your hostname, carl-Precision-7780 resolves to a loopback address: 127.0.1.1; using 192.168.1.164 instead (on interface enp0s31f6)\n", - "24/03/21 20:42:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", + "24/03/22 15:54:39 WARN Utils: Your hostname, carl-Precision-7780 resolves to a loopback address: 127.0.1.1; using 192.168.1.164 instead (on interface enp0s31f6)\n", + "24/03/22 15:54:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", - "24/03/21 20:42:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "24/03/22 15:54:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", "/home/carl/anaconda3/envs/p8env/lib/python3.12/site-packages/pyspark/sql/context.py:113: FutureWarning: Deprecated in 3.0.0. 
Use SparkSession.builder.getOrCreate() instead.\n", " warnings.warn(\n" ] @@ -35,7 +35,7 @@ "from pyspark.ml.feature import VectorAssembler\n", "from pyspark.ml.regression import LinearRegression\n", "from pyspark.sql import SQLContext\n", - "from pyspark.sql.functions import col, count, isnan, when\n", + "from pyspark.sql.functions import count, isnan, when\n", "\n", "display_opt = SparkConf().set(\"spark.sql.repl.eagerEval.enabled\", True)\n", "sc = SparkContext(conf=display_opt)\n", @@ -48,11 +48,11 @@ "id": "a73115ff-ac00-4f80-bdee-31ba8499792a", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:42:59.004879Z", - "iopub.status.busy": "2024-03-21T19:42:59.004516Z", - "iopub.status.idle": "2024-03-21T19:43:01.199579Z", - "shell.execute_reply": "2024-03-21T19:43:01.199177Z", - "shell.execute_reply.started": "2024-03-21T19:42:59.004852Z" + "iopub.execute_input": "2024-03-22T14:54:46.379381Z", + "iopub.status.busy": "2024-03-22T14:54:46.378773Z", + "iopub.status.idle": "2024-03-22T14:54:48.691405Z", + "shell.execute_reply": "2024-03-22T14:54:48.690982Z", + "shell.execute_reply.started": "2024-03-22T14:54:46.379349Z" } }, "outputs": [], @@ -72,11 +72,11 @@ "id": "39b8f9d3-4374-45bd-b446-315cdfaf32c3", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:43:01.764629Z", - "iopub.status.busy": "2024-03-21T19:43:01.764484Z", - "iopub.status.idle": "2024-03-21T19:43:01.782320Z", - "shell.execute_reply": "2024-03-21T19:43:01.782017Z", - "shell.execute_reply.started": "2024-03-21T19:43:01.764619Z" + "iopub.execute_input": "2024-03-22T14:54:49.260166Z", + "iopub.status.busy": "2024-03-22T14:54:49.260032Z", + "iopub.status.idle": "2024-03-22T14:54:49.283108Z", + "shell.execute_reply": "2024-03-22T14:54:49.282687Z", + "shell.execute_reply.started": "2024-03-22T14:54:49.260156Z" } }, "outputs": [ @@ -147,14 +147,21 @@ "id": "20d11800-ac2b-4144-9b6a-ab91581ac5e3", "metadata": { "execution": { - "iopub.execute_input": 
"2024-03-21T19:43:07.342021Z", - "iopub.status.busy": "2024-03-21T19:43:07.341730Z", - "iopub.status.idle": "2024-03-21T19:43:08.617871Z", - "shell.execute_reply": "2024-03-21T19:43:08.617585Z", - "shell.execute_reply.started": "2024-03-21T19:43:07.342000Z" + "iopub.execute_input": "2024-03-22T14:54:49.284229Z", + "iopub.status.busy": "2024-03-22T14:54:49.284063Z", + "iopub.status.idle": "2024-03-22T14:54:50.825190Z", + "shell.execute_reply": "2024-03-22T14:54:50.824360Z", + "shell.execute_reply.started": "2024-03-22T14:54:49.284215Z" } }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, { "data": { "text/html": [ @@ -387,11 +394,11 @@ "id": "1a7dfd64-6262-4798-bdc9-305cbf555129", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:43:08.897165Z", - "iopub.status.busy": "2024-03-21T19:43:08.897009Z", - "iopub.status.idle": "2024-03-21T19:43:08.900519Z", - "shell.execute_reply": "2024-03-21T19:43:08.900146Z", - "shell.execute_reply.started": "2024-03-21T19:43:08.897154Z" + "iopub.execute_input": "2024-03-22T14:54:50.825831Z", + "iopub.status.busy": "2024-03-22T14:54:50.825670Z", + "iopub.status.idle": "2024-03-22T14:54:50.830942Z", + "shell.execute_reply": "2024-03-22T14:54:50.830241Z", + "shell.execute_reply.started": "2024-03-22T14:54:50.825816Z" } }, "outputs": [ @@ -461,11 +468,11 @@ "id": "d2331605-48cc-4172-8080-4d973348bed6", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:43:09.878621Z", - "iopub.status.busy": "2024-03-21T19:43:09.878258Z", - "iopub.status.idle": "2024-03-21T19:43:09.883339Z", - "shell.execute_reply": "2024-03-21T19:43:09.882286Z", - "shell.execute_reply.started": "2024-03-21T19:43:09.878594Z" + "iopub.execute_input": "2024-03-22T14:54:50.831694Z", + "iopub.status.busy": "2024-03-22T14:54:50.831540Z", + "iopub.status.idle": "2024-03-22T14:54:50.834443Z", + "shell.execute_reply": "2024-03-22T14:54:50.833892Z", + "shell.execute_reply.started": 
"2024-03-22T14:54:50.831680Z" } }, "outputs": [], @@ -481,21 +488,14 @@ "id": "fb37a9e9-fcc3-4164-babf-d40472064e5a", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:43:10.434406Z", - "iopub.status.busy": "2024-03-21T19:43:10.434047Z", - "iopub.status.idle": "2024-03-21T19:43:10.614599Z", - "shell.execute_reply": "2024-03-21T19:43:10.614254Z", - "shell.execute_reply.started": "2024-03-21T19:43:10.434382Z" + "iopub.execute_input": "2024-03-22T14:54:51.452960Z", + "iopub.status.busy": "2024-03-22T14:54:51.452784Z", + "iopub.status.idle": "2024-03-22T14:54:51.678508Z", + "shell.execute_reply": "2024-03-22T14:54:51.677960Z", + "shell.execute_reply.started": "2024-03-22T14:54:51.452943Z" } }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "24/03/21 20:43:10 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -516,15 +516,15 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 8, "id": "4af50dc1-1fef-41ac-b7e6-4ef504c062f6", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:34:19.923760Z", - "iopub.status.busy": "2024-03-21T19:34:19.923414Z", - "iopub.status.idle": "2024-03-21T19:34:19.927997Z", - "shell.execute_reply": "2024-03-21T19:34:19.927672Z", - "shell.execute_reply.started": "2024-03-21T19:34:19.923735Z" + "iopub.execute_input": "2024-03-22T14:54:51.679311Z", + "iopub.status.busy": "2024-03-22T14:54:51.679136Z", + "iopub.status.idle": "2024-03-22T14:54:51.684109Z", + "shell.execute_reply": "2024-03-22T14:54:51.683758Z", + "shell.execute_reply.started": "2024-03-22T14:54:51.679297Z" } }, "outputs": [ @@ -579,7 +579,7 @@ " ('GHGEmissionsIntensity', 'double')]" ] }, - "execution_count": 64, + "execution_count": 8, 
"metadata": {}, "output_type": "execute_result" } @@ -590,15 +590,15 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "752e9585-af21-4bd3-9fc9-6b4d3b723d86", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:44:04.674777Z", - "iopub.status.busy": "2024-03-21T19:44:04.674471Z", - "iopub.status.idle": "2024-03-21T19:44:05.173212Z", - "shell.execute_reply": "2024-03-21T19:44:05.172767Z", - "shell.execute_reply.started": "2024-03-21T19:44:04.674756Z" + "iopub.execute_input": "2024-03-22T14:54:52.528303Z", + "iopub.status.busy": "2024-03-22T14:54:52.528148Z", + "iopub.status.idle": "2024-03-22T14:54:53.019600Z", + "shell.execute_reply": "2024-03-22T14:54:53.019134Z", + "shell.execute_reply.started": "2024-03-22T14:54:52.528288Z" } }, "outputs": [ @@ -629,11 +629,11 @@ "id": "7aff4c74-e904-475f-bb2a-681022040848", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:45:06.698197Z", - "iopub.status.busy": "2024-03-21T19:45:06.697934Z", - "iopub.status.idle": "2024-03-21T19:45:06.703149Z", - "shell.execute_reply": "2024-03-21T19:45:06.702354Z", - "shell.execute_reply.started": "2024-03-21T19:45:06.698179Z" + "iopub.execute_input": "2024-03-22T14:54:53.271465Z", + "iopub.status.busy": "2024-03-22T14:54:53.271227Z", + "iopub.status.idle": "2024-03-22T14:54:53.275818Z", + "shell.execute_reply": "2024-03-22T14:54:53.275152Z", + "shell.execute_reply.started": "2024-03-22T14:54:53.271445Z" } }, "outputs": [ @@ -655,15 +655,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "4eab6d4d-1057-45e1-a722-726f69173ac6", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:44:52.981888Z", - "iopub.status.busy": "2024-03-21T19:44:52.981178Z", - "iopub.status.idle": "2024-03-21T19:44:52.985739Z", - "shell.execute_reply": "2024-03-21T19:44:52.985366Z", - "shell.execute_reply.started": "2024-03-21T19:44:52.981844Z" + "iopub.execute_input": "2024-03-22T14:54:53.277151Z", 
+ "iopub.status.busy": "2024-03-22T14:54:53.276957Z", + "iopub.status.idle": "2024-03-22T14:54:53.285791Z", + "shell.execute_reply": "2024-03-22T14:54:53.285152Z", + "shell.execute_reply.started": "2024-03-22T14:54:53.277134Z" } }, "outputs": [ @@ -682,15 +682,15 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 12, "id": "da71fe65-32af-41f3-b176-14c9b088787c", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:31:47.558614Z", - "iopub.status.busy": "2024-03-21T19:31:47.558264Z", - "iopub.status.idle": "2024-03-21T19:31:47.564584Z", - "shell.execute_reply": "2024-03-21T19:31:47.563545Z", - "shell.execute_reply.started": "2024-03-21T19:31:47.558590Z" + "iopub.execute_input": "2024-03-22T14:54:53.569510Z", + "iopub.status.busy": "2024-03-22T14:54:53.568962Z", + "iopub.status.idle": "2024-03-22T14:54:53.577783Z", + "shell.execute_reply": "2024-03-22T14:54:53.576739Z", + "shell.execute_reply.started": "2024-03-22T14:54:53.569469Z" } }, "outputs": [ @@ -700,7 +700,7 @@ "29" ] }, - "execution_count": 52, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -711,15 +711,15 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 13, "id": "7c29578b-be63-4396-8ad1-9d4696352cf2", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:25:35.238136Z", - "iopub.status.busy": "2024-03-21T19:25:35.237989Z", - "iopub.status.idle": "2024-03-21T19:25:35.256489Z", - "shell.execute_reply": "2024-03-21T19:25:35.255944Z", - "shell.execute_reply.started": "2024-03-21T19:25:35.238126Z" + "iopub.execute_input": "2024-03-22T14:54:54.131845Z", + "iopub.status.busy": "2024-03-22T14:54:54.131598Z", + "iopub.status.idle": "2024-03-22T14:54:54.176130Z", + "shell.execute_reply": "2024-03-22T14:54:54.175443Z", + "shell.execute_reply.started": "2024-03-22T14:54:54.131822Z" } }, "outputs": [], @@ -730,15 +730,15 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 14, "id": 
"bf00f3f1-5b6f-45e8-aca3-62bd06533a5d", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:29:39.107470Z", - "iopub.status.busy": "2024-03-21T19:29:39.107128Z", - "iopub.status.idle": "2024-03-21T19:29:39.188356Z", - "shell.execute_reply": "2024-03-21T19:29:39.187868Z", - "shell.execute_reply.started": "2024-03-21T19:29:39.107455Z" + "iopub.execute_input": "2024-03-22T14:54:54.440385Z", + "iopub.status.busy": "2024-03-22T14:54:54.440280Z", + "iopub.status.idle": "2024-03-22T14:54:54.493691Z", + "shell.execute_reply": "2024-03-22T14:54:54.493303Z", + "shell.execute_reply.started": "2024-03-22T14:54:54.440376Z" } }, "outputs": [ @@ -748,7 +748,7 @@ "3376" ] }, - "execution_count": 44, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -759,15 +759,15 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "id": "30ae876a-aaad-47ea-8034-aaf2a8f42bc6", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:46:17.346818Z", - "iopub.status.busy": "2024-03-21T19:46:17.346565Z", - "iopub.status.idle": "2024-03-21T19:46:17.359607Z", - "shell.execute_reply": "2024-03-21T19:46:17.358814Z", - "shell.execute_reply.started": "2024-03-21T19:46:17.346800Z" + "iopub.execute_input": "2024-03-22T14:54:54.494367Z", + "iopub.status.busy": "2024-03-22T14:54:54.494210Z", + "iopub.status.idle": "2024-03-22T14:54:54.504964Z", + "shell.execute_reply": "2024-03-22T14:54:54.504380Z", + "shell.execute_reply.started": "2024-03-22T14:54:54.494353Z" } }, "outputs": [], @@ -778,15 +778,15 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "171d6423-bdd1-4024-8a22-bc1d129bc54b", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:47:15.272855Z", - "iopub.status.busy": "2024-03-21T19:47:15.272516Z", - "iopub.status.idle": "2024-03-21T19:47:15.278561Z", - "shell.execute_reply": "2024-03-21T19:47:15.277871Z", - "shell.execute_reply.started": "2024-03-21T19:47:15.272832Z" 
+ "iopub.execute_input": "2024-03-22T14:54:54.817719Z", + "iopub.status.busy": "2024-03-22T14:54:54.817367Z", + "iopub.status.idle": "2024-03-22T14:54:54.824108Z", + "shell.execute_reply": "2024-03-22T14:54:54.823382Z", + "shell.execute_reply.started": "2024-03-22T14:54:54.817692Z" } }, "outputs": [ @@ -824,7 +824,7 @@ " 'GHGEmissionsIntensity']" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -835,15 +835,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "a486d74d-6a8b-46ce-b63e-ad02d96f462a", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:50:40.310265Z", - "iopub.status.busy": "2024-03-21T19:50:40.309941Z", - "iopub.status.idle": "2024-03-21T19:50:40.314474Z", - "shell.execute_reply": "2024-03-21T19:50:40.313735Z", - "shell.execute_reply.started": "2024-03-21T19:50:40.310242Z" + "iopub.execute_input": "2024-03-22T14:54:55.645509Z", + "iopub.status.busy": "2024-03-22T14:54:55.645167Z", + "iopub.status.idle": "2024-03-22T14:54:55.649674Z", + "shell.execute_reply": "2024-03-22T14:54:55.648991Z", + "shell.execute_reply.started": "2024-03-22T14:54:55.645480Z" } }, "outputs": [], @@ -853,15 +853,15 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 18, "id": "4c31643c-2749-48a2-9e06-e19318ffe11e", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:56:55.079753Z", - "iopub.status.busy": "2024-03-21T19:56:55.079435Z", - "iopub.status.idle": "2024-03-21T19:56:55.084674Z", - "shell.execute_reply": "2024-03-21T19:56:55.083951Z", - "shell.execute_reply.started": "2024-03-21T19:56:55.079731Z" + "iopub.execute_input": "2024-03-22T14:54:55.915851Z", + "iopub.status.busy": "2024-03-22T14:54:55.915591Z", + "iopub.status.idle": "2024-03-22T14:54:55.921149Z", + "shell.execute_reply": "2024-03-22T14:54:55.920457Z", + "shell.execute_reply.started": "2024-03-22T14:54:55.915828Z" } }, "outputs": [], @@ -879,15 +879,15 @@ }, { 
"cell_type": "code", - "execution_count": 32, + "execution_count": 19, "id": "bf903c23-dfa1-466e-a240-840b79cb3940", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:56:57.292177Z", - "iopub.status.busy": "2024-03-21T19:56:57.291901Z", - "iopub.status.idle": "2024-03-21T19:56:57.321592Z", - "shell.execute_reply": "2024-03-21T19:56:57.321313Z", - "shell.execute_reply.started": "2024-03-21T19:56:57.292157Z" + "iopub.execute_input": "2024-03-22T14:54:56.591279Z", + "iopub.status.busy": "2024-03-22T14:54:56.590962Z", + "iopub.status.idle": "2024-03-22T14:54:56.661990Z", + "shell.execute_reply": "2024-03-22T14:54:56.661643Z", + "shell.execute_reply.started": "2024-03-22T14:54:56.591257Z" }, "scrolled": true }, @@ -903,15 +903,15 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 20, "id": "7bf9e53f-597d-4af6-b4e5-8dcba12df806", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T19:59:15.853846Z", - "iopub.status.busy": "2024-03-21T19:59:15.853500Z", - "iopub.status.idle": "2024-03-21T19:59:15.884833Z", - "shell.execute_reply": "2024-03-21T19:59:15.884444Z", - "shell.execute_reply.started": "2024-03-21T19:59:15.853813Z" + "iopub.execute_input": "2024-03-22T14:54:58.493010Z", + "iopub.status.busy": "2024-03-22T14:54:58.492715Z", + "iopub.status.idle": "2024-03-22T14:54:58.538878Z", + "shell.execute_reply": "2024-03-22T14:54:58.538485Z", + "shell.execute_reply.started": "2024-03-22T14:54:58.492989Z" } }, "outputs": [ @@ -954,15 +954,15 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 21, "id": "135f213a-73d3-4a1a-a036-19221db803cf", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:08:50.727271Z", - "iopub.status.busy": "2024-03-21T20:08:50.726960Z", - "iopub.status.idle": "2024-03-21T20:08:50.735771Z", - "shell.execute_reply": "2024-03-21T20:08:50.735247Z", - "shell.execute_reply.started": "2024-03-21T20:08:50.727250Z" + "iopub.execute_input": 
"2024-03-22T14:54:58.907301Z", + "iopub.status.busy": "2024-03-22T14:54:58.907197Z", + "iopub.status.idle": "2024-03-22T14:54:58.917459Z", + "shell.execute_reply": "2024-03-22T14:54:58.917170Z", + "shell.execute_reply.started": "2024-03-22T14:54:58.907292Z" } }, "outputs": [], @@ -972,25 +972,25 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 22, "id": "00584f68-5dab-4dac-bd53-8c234b3146f4", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:08:51.691723Z", - "iopub.status.busy": "2024-03-21T20:08:51.691573Z", - "iopub.status.idle": "2024-03-21T20:08:51.758959Z", - "shell.execute_reply": "2024-03-21T20:08:51.758634Z", - "shell.execute_reply.started": "2024-03-21T20:08:51.691713Z" + "iopub.execute_input": "2024-03-22T14:54:59.647369Z", + "iopub.status.busy": "2024-03-22T14:54:59.647253Z", + "iopub.status.idle": "2024-03-22T14:54:59.735076Z", + "shell.execute_reply": "2024-03-22T14:54:59.734530Z", + "shell.execute_reply.started": "2024-03-22T14:54:59.647359Z" } }, "outputs": [ { "data": { "text/plain": [ - "2435" + "2355" ] }, - "execution_count": 50, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1001,25 +1001,25 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 23, "id": "5c4cb2e5-ea66-4a2e-a0ea-cb53500c0239", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:08:55.143565Z", - "iopub.status.busy": "2024-03-21T20:08:55.143417Z", - "iopub.status.idle": "2024-03-21T20:08:55.217304Z", - "shell.execute_reply": "2024-03-21T20:08:55.217036Z", - "shell.execute_reply.started": "2024-03-21T20:08:55.143555Z" + "iopub.execute_input": "2024-03-22T14:54:59.735611Z", + "iopub.status.busy": "2024-03-22T14:54:59.735505Z", + "iopub.status.idle": "2024-03-22T14:54:59.811983Z", + "shell.execute_reply": "2024-03-22T14:54:59.811373Z", + "shell.execute_reply.started": "2024-03-22T14:54:59.735601Z" } }, "outputs": [ { "data": { "text/plain": [ - "941" + "1021" ] }, - 
"execution_count": 51, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1030,24 +1030,31 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 24, "id": "472947c2-0361-4cb9-a328-181722bc9bb9", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:08:56.806528Z", - "iopub.status.busy": "2024-03-21T20:08:56.806348Z", - "iopub.status.idle": "2024-03-21T20:08:57.019657Z", - "shell.execute_reply": "2024-03-21T20:08:57.019280Z", - "shell.execute_reply.started": "2024-03-21T20:08:56.806517Z" + "iopub.execute_input": "2024-03-22T14:55:00.972151Z", + "iopub.status.busy": "2024-03-22T14:55:00.972048Z", + "iopub.status.idle": "2024-03-22T14:55:01.972073Z", + "shell.execute_reply": "2024-03-22T14:55:01.971490Z", + "shell.execute_reply.started": "2024-03-22T14:55:00.972142Z" } }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/03/22 15:55:01 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Coefficients: [-151.07954359371521,0.0,-53.51730378157163,-240554.4896691815,8212322.653426656,10486418.26409136,-673.0120424320967,-1653700.1162544375,217778.52265853234,-2.16218422315188,-84.81031719628625,-0.0,33.23002323360613,139.22501191859666,407.55695809713234,-1.3963113958138527e-52]\n", - "Intercept: 904104916.2293359\n" + "Coefficients: [-169.835221691931,0.0,-61.23284804150245,-185513.58183636863,6662550.172168628,12709877.40851308,-2931.877528773897,-1230419.0625483326,217689.245697947,6.019954257079121,-41.14292565613203,6.68375469861609,15.462736917063278,126.01066663417589,-66.00224376693296,-1.99739646325876e-52]\n", + "Intercept: 1255650414.0599008\n" ] } ], @@ -1067,15 +1074,15 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 25, "id": "bd992231-9b5b-42bc-acd1-0687cb9b4428", "metadata": { "execution": { - "iopub.execute_input": 
"2024-03-21T20:08:59.924881Z", - "iopub.status.busy": "2024-03-21T20:08:59.923927Z", - "iopub.status.idle": "2024-03-21T20:08:59.929733Z", - "shell.execute_reply": "2024-03-21T20:08:59.929243Z", - "shell.execute_reply.started": "2024-03-21T20:08:59.924851Z" + "iopub.execute_input": "2024-03-22T14:55:02.566794Z", + "iopub.status.busy": "2024-03-22T14:55:02.566679Z", + "iopub.status.idle": "2024-03-22T14:55:02.569595Z", + "shell.execute_reply": "2024-03-22T14:55:02.569346Z", + "shell.execute_reply.started": "2024-03-22T14:55:02.566784Z" } }, "outputs": [ @@ -1083,8 +1090,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "RMSE: 12917246.730128\n", - "r2: 0.455302\n" + "RMSE: 12569322.832757\n", + "r2: 0.349250\n" ] } ], @@ -1097,15 +1104,15 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 26, "id": "15ec09ab-d168-4fcb-b67f-1689ba513625", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:09:01.107098Z", - "iopub.status.busy": "2024-03-21T20:09:01.106881Z", - "iopub.status.idle": "2024-03-21T20:09:01.188165Z", - "shell.execute_reply": "2024-03-21T20:09:01.187504Z", - "shell.execute_reply.started": "2024-03-21T20:09:01.107083Z" + "iopub.execute_input": "2024-03-22T14:55:02.570201Z", + "iopub.status.busy": "2024-03-22T14:55:02.570104Z", + "iopub.status.idle": "2024-03-22T14:55:02.677533Z", + "shell.execute_reply": "2024-03-22T14:55:02.676870Z", + "shell.execute_reply.started": "2024-03-22T14:55:02.570192Z" } }, "outputs": [ @@ -1116,11 +1123,11 @@ "+-------+---------------------+\n", "|summary|SiteEnergyUseWN(kBtu)|\n", "+-------+---------------------+\n", - "| count| 2435|\n", - "| mean| 5398252.014250308|\n", - "| stddev| 1.7505775193069838E7|\n", + "| count| 2355|\n", + "| mean| 5478732.977553693|\n", + "| stddev| 1.5584646744936252E7|\n", "| min| 0.0|\n", - "| max| 4.71613856E8|\n", + "| max| 2.96671744E8|\n", "+-------+---------------------+\n", "\n" ] @@ -1132,15 +1139,15 @@ }, { "cell_type": "code", - 
"execution_count": 57, + "execution_count": 27, "id": "9a5727d9-47b2-4ba4-ba51-7bcb6f4fd6d6", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:09:44.267725Z", - "iopub.status.busy": "2024-03-21T20:09:44.266616Z", - "iopub.status.idle": "2024-03-21T20:09:44.340281Z", - "shell.execute_reply": "2024-03-21T20:09:44.339718Z", - "shell.execute_reply.started": "2024-03-21T20:09:44.267691Z" + "iopub.execute_input": "2024-03-22T14:55:02.678012Z", + "iopub.status.busy": "2024-03-22T14:55:02.677910Z", + "iopub.status.idle": "2024-03-22T14:55:02.756394Z", + "shell.execute_reply": "2024-03-22T14:55:02.755974Z", + "shell.execute_reply.started": "2024-03-22T14:55:02.678003Z" } }, "outputs": [ @@ -1151,11 +1158,11 @@ "+--------------------+---------------------+--------------------+\n", "| prediction|SiteEnergyUseWN(kBtu)| features|\n", "+--------------------+---------------------+--------------------+\n", - "| 1.764028689962101E7| 7.3937112E7|[3.0,2016.0,98101...|\n", - "| 2360176.4152789116| 1.2581712E7|[9.0,2016.0,98101...|\n", - "| 7184788.142513275| 1.664693E7|[15.0,2016.0,9810...|\n", - "|2.1202014952000618E7| 4.7023088E7|[26.0,2016.0,9810...|\n", - "|1.9397328736707926E7| 4.9539212E7|[27.0,2016.0,9810...|\n", + "| 8885456.891724348| 7456910.0|[1.0,2016.0,98101...|\n", + "| 8567642.881640196| 6062767.5|[10.0,2016.0,9810...|\n", + "|1.1738310639999866E7| 1.4194054E7|[12.0,2016.0,9810...|\n", + "|1.5553373759315252E7| 2.7070114E7|[16.0,2016.0,9810...|\n", + "| 1.275119070737648E7| 5424942.0|[32.0,2016.0,9810...|\n", "+--------------------+---------------------+--------------------+\n", "only showing top 5 rows\n", "\n" @@ -1169,15 +1176,15 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 28, "id": "e77fa9e4-c547-4f18-8f1e-9d38f95e9ed3", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:10:35.027354Z", - "iopub.status.busy": "2024-03-21T20:10:35.026667Z", - "iopub.status.idle": "2024-03-21T20:10:35.100643Z", - 
"shell.execute_reply": "2024-03-21T20:10:35.100257Z", - "shell.execute_reply.started": "2024-03-21T20:10:35.027333Z" + "iopub.execute_input": "2024-03-22T14:55:05.937477Z", + "iopub.status.busy": "2024-03-22T14:55:05.937179Z", + "iopub.status.idle": "2024-03-22T14:55:06.026360Z", + "shell.execute_reply": "2024-03-22T14:55:06.025804Z", + "shell.execute_reply.started": "2024-03-22T14:55:05.937456Z" } }, "outputs": [ @@ -1185,7 +1192,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "R Squared (R2) on test data = 0.116433\n" + "R Squared (R2) on test data = 0.29902\n" ] } ], @@ -1200,15 +1207,15 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 29, "id": "a23cb597-5b16-4c83-b3ff-243495a94264", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:11:15.946718Z", - "iopub.status.busy": "2024-03-21T20:11:15.946282Z", - "iopub.status.idle": "2024-03-21T20:11:16.043020Z", - "shell.execute_reply": "2024-03-21T20:11:16.042495Z", - "shell.execute_reply.started": "2024-03-21T20:11:15.946690Z" + "iopub.execute_input": "2024-03-22T14:55:06.668319Z", + "iopub.status.busy": "2024-03-22T14:55:06.668217Z", + "iopub.status.idle": "2024-03-22T14:55:06.724262Z", + "shell.execute_reply": "2024-03-22T14:55:06.723652Z", + "shell.execute_reply.started": "2024-03-22T14:55:06.668309Z" } }, "outputs": [ @@ -1216,7 +1223,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Root Mean Squared Error (RMSE) on test data = 1.01625e+07\n" + "Root Mean Squared Error (RMSE) on test data = 1.39625e+07\n" ] } ], @@ -1230,15 +1237,15 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 30, "id": "943541e5-84e3-4d90-a557-6f2aeba69e02", "metadata": { "execution": { - "iopub.execute_input": "2024-03-21T20:11:26.920558Z", - "iopub.status.busy": "2024-03-21T20:11:26.920208Z", - "iopub.status.idle": "2024-03-21T20:11:27.003940Z", - "shell.execute_reply": "2024-03-21T20:11:27.003366Z", - "shell.execute_reply.started": 
"2024-03-21T20:11:26.920533Z" + "iopub.execute_input": "2024-03-22T14:55:13.218237Z", + "iopub.status.busy": "2024-03-22T14:55:13.217908Z", + "iopub.status.idle": "2024-03-22T14:55:13.296073Z", + "shell.execute_reply": "2024-03-22T14:55:13.295649Z", + "shell.execute_reply.started": "2024-03-22T14:55:13.218216Z" } }, "outputs": [ @@ -1247,30 +1254,30 @@ "output_type": "stream", "text": [ "numIterations: 10\n", - "objectiveHistory: [0.4999999999999999, 0.4513405332379861, 0.3496288720705346, 0.3132059678991644, 0.2979175219459393, 0.29099582620176784, 0.28634347984746333, 0.2816542883456642, 0.278035308123612, 0.275071666865442, 0.27234891682937795]\n", + "objectiveHistory: [0.5000000000000001, 0.4536465285672086, 0.38173018818596827, 0.35898165958423267, 0.34920852377031136, 0.34218144434462267, 0.33546395273514285, 0.33122907103410054, 0.3281411554929128, 0.32639595888831635, 0.32537510763814687]\n", "+--------------------+\n", "| residuals|\n", "+--------------------+\n", - "| -243489.84001803398|\n", - "| -1341477.7421656847|\n", - "| 494338.07420897484|\n", - "| 515201.9758192301|\n", - "| -1263634.9673179388|\n", - "| -283987.53382360935|\n", - "| 3499325.4105563164|\n", - "| -436322.10798954964|\n", - "|1.4048734492967248E7|\n", - "| 37952.919716358185|\n", - "| 738110.4384701252|\n", - "| 3012087.1990534067|\n", - "| 362873.19047009945|\n", - "| -1783005.3338081837|\n", - "|1.0624193788422823E7|\n", - "| -2121161.340616703|\n", - "| 2397479.845118284|\n", - "| -4644345.676080108|\n", - "| -3645164.687152028|\n", - "| -7219776.80274415|\n", + "| -853400.4558849335|\n", + "|4.6907662356511354E7|\n", + "| -771775.8544082642|\n", + "| -2609141.35815382|\n", + "| 7701158.298909187|\n", + "| -1442988.020166397|\n", + "| -1949979.2562639713|\n", + "| 7487695.064352512|\n", + "| -1230901.5787043571|\n", + "| -1416627.4792191982|\n", + "| 767720.289607048|\n", + "| -1517345.2611379623|\n", + "| -2657093.936000824|\n", + "| 7266613.60664463|\n", + "| 
622579.0868973732|\n", + "| 1207522.3704488277|\n", + "| 2.58342321650815E7|\n", + "|3.0136061724802017E7|\n", + "| -5391802.03095746|\n", + "| -6739086.736651182|\n", "+--------------------+\n", "only showing top 20 rows\n", "\n" @@ -1282,6 +1289,14 @@ "print(\"objectiveHistory: %s\" % str(trainingSummary.objectiveHistory))\n", "trainingSummary.residuals.show()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e35041e6-3961-4f79-9a31-d8c6f173aab4", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/setup.py b/setup.py index 9157905..7019aea 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,10 @@ from setuptools import find_packages, setup setup( - name='src', + name="src", packages=find_packages(), - version='0.1.0', - description='OpenClassrooms Projet 8 : Déployer un modèle dans le cloud', - author='carl', - license='', + version="0.1.0", + description="OpenClassrooms Projet 8 : Déployer un modèle dans le cloud", + author="carl", + license="", ) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 96b377a..ec4b391 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -1,23 +1,24 @@ # -*- coding: utf-8 -*- -import click import logging from pathlib import Path + +import click from dotenv import find_dotenv, load_dotenv @click.command() -@click.argument('input_filepath', type=click.Path(exists=True)) -@click.argument('output_filepath', type=click.Path()) +@click.argument("input_filepath", type=click.Path(exists=True)) +@click.argument("output_filepath", type=click.Path()) def main(input_filepath, output_filepath): - """ Runs data processing scripts to turn raw data from (../raw) into - cleaned data ready to be analyzed (saved in ../processed). + """Runs data processing scripts to turn raw data from (../raw) into + cleaned data ready to be analyzed (saved in ../processed). 
""" logger = logging.getLogger(__name__) - logger.info('making final data set from raw data') + logger.info("making final data set from raw data") -if __name__ == '__main__': - log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' +if __name__ == "__main__": + log_fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" logging.basicConfig(level=logging.INFO, format=log_fmt) # not used in this stub but often useful for finding various files diff --git a/test_environment.py b/test_environment.py index d0ac4a7..5381850 100644 --- a/test_environment.py +++ b/test_environment.py @@ -10,16 +10,17 @@ def main(): elif REQUIRED_PYTHON == "python3": required_major = 3 else: - raise ValueError("Unrecognized python interpreter: {}".format( - REQUIRED_PYTHON)) + raise ValueError("Unrecognized python interpreter: {}".format(REQUIRED_PYTHON)) if system_major != required_major: raise TypeError( "This project requires Python {}. Found: Python {}".format( - required_major, sys.version)) + required_major, sys.version + ) + ) else: print(">>> Development environment passes all tests!") -if __name__ == '__main__': +if __name__ == "__main__": main()