From a3d17db28a2fb97afe098782f92243a01fa51f40 Mon Sep 17 00:00:00 2001 From: Anna Eremina <127204320+annaeremina96@users.noreply.github.com> Date: Thu, 30 Nov 2023 15:41:01 +0300 Subject: [PATCH] Add files via upload --- product_analytics_rfm_analysis.ipynb | 1015 ++++++++++++++++++++++++++ 1 file changed, 1015 insertions(+) create mode 100644 product_analytics_rfm_analysis.ipynb diff --git a/product_analytics_rfm_analysis.ipynb b/product_analytics_rfm_analysis.ipynb new file mode 100644 index 0000000..b9bdfb4 --- /dev/null +++ b/product_analytics_rfm_analysis.ipynb @@ -0,0 +1,1015 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Продуктовая аналитика: Методы сегментации клиентов и целевой аудитории\n", + "\n", + "Обсудим, зачем необходимо сегментировать целевую аудиторию. Поговорим об особенностях сегментации в B2B и B2C продуктах. Познакомимся с популярными критериями выделения сегментов и методами сегментации: описательными (модель 5W, Khramatrix и др.) и аналитическими (RFM, когорты, k-means и др.)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Проект. RFM анализ\n", + "\n", + "**Скачайте датасет и проведите RFM анализ. В каждом подсегменте поделите пользователей на 4 класса. Отсчитывайте количество дней, прошедших с момента последней покупки, с максимальной даты покупки в датасете.**\n", + "\n", + "- Какое максимальное кол-во покупок было совершено одним пользователем?\n", + "- Какая верхняя граница у суммы покупок у пользователей с классом 4 в подсегменте М? 
# %% [markdown]
# RFM analysis - remaining part of the task statement:
# (In other words: customers whose total purchase amount is between 0 and X fall
# into class 4 of the M sub-segment.)
# - What is the lower bound of the purchase count for class 1 of the F sub-segment?
# - What is the upper bound of the number of days since the last purchase for
#   class 2 of the R sub-segment?
# - How many customers fall into segment 111?
# - How many customers fall into segment 311?
# - Which RFM segment has the most customers?
# - Which RFM segment has the fewest customers?
# - How many customers are in the smallest segment?

# %%
# `%matplotlib inline` is omitted: notebook front-ends render matplotlib figures
# inline by default.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# %%
# Single place to repoint the notebook at the transactions file.
DATA_PATH = '/mnt/HC_Volume_18315164/home-jupyter/jupyter-a-eremina/product_analytics/RFM_ht_data.csv'

# Read both identifier columns as text. Without an explicit dtype pandas infers
# the type chunk by chunk and reported "Columns (1) have mixed types" for
# CustomerCode: a code such as 02213019 that lands in an all-numeric chunk is
# parsed as an integer, loses its leading zero, and the same customer could then
# be counted under two different keys.
project = pd.read_csv(
    DATA_PATH,
    sep=',',
    dtype={'InvoiceNo': str, 'CustomerCode': str},
)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
InvoiceNoCustomerCodeInvoiceDateAmount
0C0011810010001190672902020-09-011716.00
1C0011810010017132339332020-09-011489.74
2C0011810010020990579682020-09-01151.47
3C0011810010021800072762020-09-01146.72
4C0011810010024131640762020-09-01104.00
\n", + "
# %%
project.head()

# %%
project.shape

# %% [markdown]
# Bring the columns to the required types:

# %%
# Parse the invoice dates and store both identifier columns as strings.
project = project.assign(
    InvoiceDate=pd.to_datetime(project['InvoiceDate'])
).astype({'CustomerCode': str, 'InvoiceNo': str})

# %%
# Reference point for recency: the latest invoice date present in the data.
last_date = project['InvoiceDate'].max()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerCodesize
8938819057820204
4459413215452113
1034713032521106
970771908088099
1199519900306190
.........
58910132728611
58911132728711
58913132728751
58914132728781
61866140007781
\n", + "

123733 rows × 2 columns

\n", + "
# %%
# Number of invoice rows per customer, busiest customers first.
purchases_per_customer = project.groupby('CustomerCode', as_index=False).size()
purchases_per_customer.sort_values(by='size', ascending=False)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerCoderecencyfrequencymonetary_value
0022130191911609.20
1022130422239685.48
202213071291415.00
302213088231305.00
4022130922511412.88
...............
12372899099927101961.10
12372999099936011521.78
12373099099959821444.56
123731990999631913018.91
12373299099972024577.37
\n", + "

123733 rows × 4 columns

\n", + "
# %%
# Build the RFM table: one row per customer with
#   recency        - whole days between the customer's latest invoice and last_date,
#   frequency      - number of invoice rows of the customer,
#   monetary_value - sum of Amount over the customer's invoices.
# Built-in aggregations ('max', 'size', 'sum') replace the per-group Python
# lambdas: same numbers, without one Python call per customer and column.
rfm_table = project.groupby('CustomerCode', as_index=False).agg(
    recency=('InvoiceDate', 'max'),
    frequency=('InvoiceNo', 'size'),
    monetary_value=('Amount', 'sum'),
)

# 'recency' still holds the latest invoice date here; turn it into a day count.
rfm_table['recency'] = (last_date - rfm_table['recency']).dt.days.astype(int)

# Every customer must appear exactly once in the RFM table.
assert rfm_table['CustomerCode'].is_unique

rfm_table
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
recencyfrequencymonetary_value
0.252.01.0765.00
0.508.02.01834.48
0.7516.03.04008.84
\n", + "
# %%
# Quartile boundaries of the three RFM metrics.
# The metric columns are selected explicitly: rfm_table also carries the string
# column CustomerCode, and DataFrame.quantile no longer drops non-numeric columns
# silently on recent pandas versions.
RFM_METRICS = ['recency', 'frequency', 'monetary_value']
quantiles = rfm_table[RFM_METRICS].quantile(q=[0.25, 0.5, 0.75])
quantiles

# %%
# Split customers into classes.

def _quartile_rank(value, parameter_name, quantiles_table):
    """Return 1..4: the index of the first quartile boundary that `value` does not exceed.

    `quantiles_table[parameter_name]` must be indexable by 0.25, 0.5 and 0.75.
    A value above the 0.75 boundary (or one that compares False against every
    boundary, e.g. NaN) gets rank 4.
    """
    bounds = quantiles_table[parameter_name]
    for rank, q in enumerate((0.25, 0.5, 0.75), start=1):
        if value <= bounds[q]:
            return rank
    return 4


def Rclass(value, parameter_name, quantiles_table):
    """Class for recency: 1 for the smallest values (<= 0.25 quantile), 4 for the largest."""
    return _quartile_rank(value, parameter_name, quantiles_table)


def FMclass(value, parameter_name, quantiles_table):
    """Class for frequency / monetary value: 4 for the smallest values, 1 for the largest."""
    # Same boundaries as Rclass with the label order reversed (1<->4, 2<->3).
    return 5 - _quartile_rank(value, parameter_name, quantiles_table)

# %%
# Work on a copy so that adding the class columns below does not mutate rfm_table.
rfm_segmentation = rfm_table.copy()

# %%
rfm_segmentation['R_quantile'] = rfm_segmentation['recency'].apply(
    Rclass, args=('recency', quantiles))
rfm_segmentation['F_quantile'] = rfm_segmentation['frequency'].apply(
    FMclass, args=('frequency', quantiles))
rfm_segmentation['M_quantile'] = rfm_segmentation['monetary_value'].apply(
    FMclass, args=('monetary_value', quantiles))

# Segment label: the three class digits concatenated in R, F, M order, e.g. '443'.
rfm_segmentation['RFM_class'] = (
    rfm_segmentation['R_quantile'].astype(str)
    + rfm_segmentation['F_quantile'].astype(str)
    + rfm_segmentation['M_quantile'].astype(str)
)
13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerCoderecencyfrequencymonetary_valueR_quantileF_quantileM_quantileRFM_class
0022130191911609.20443443
1022130422239685.48421421
202213071291415.00444444
302213088231305.00444444
4022130922511412.88443443
\n", + "
# %%
rfm_segmentation.head()

# %% [markdown]
# What is the maximum number of purchases made by a single customer?

# %%
rfm_segmentation['frequency'].max()

# %% [markdown]
# What is the upper bound of the total purchase amount for class 4 of the M
# sub-segment? (In other words: customers whose total purchase amount is between
# 0 and X fall into class 4 of the M sub-segment.)

# %%
# Largest monetary value observed among the customers assigned to M class 4.
m_class_4 = rfm_segmentation.query('M_quantile == 4')
m_class_4['monetary_value'].max()

# %% [markdown]
# What is the lower bound of the purchase count for class 1 of the F sub-segment?
# %%
# Smallest purchase count observed among the customers assigned to F class 1.
f_class_1 = rfm_segmentation.query('F_quantile == 1')
f_class_1['frequency'].min()

# %% [markdown]
# What is the maximum number of days that may pass since the last purchase for a
# customer to fall into class 2 of the R sub-segment?

# %%
# Largest recency observed among the customers assigned to R class 2.
r_class_2 = rfm_segmentation.query('R_quantile == 2')
r_class_2['recency'].max()

# %% [markdown]
# How many customers fall into segment 111?

# %%
len(rfm_segmentation.query('RFM_class == "111"'))

# %% [markdown]
# How many customers fall into segment 311?

# %%
len(rfm_segmentation.query('RFM_class == "311"'))

# %% [markdown]
# Which RFM segment has the most customers?
# %%
# Number of customers in every RFM segment, computed once and reused by the
# following cells. The label of the largest segment is shown here.
segment_sizes = rfm_segmentation.groupby('RFM_class').size()
segment_sizes.idxmax()

# %% [markdown]
# Which RFM segment has the fewest customers?

# %%
segment_sizes.idxmin()

# %% [markdown]
# How many customers are in the smallest segment?

# %%
segment_sizes.min()
# %%
# Heatmap of the median monetary value for every (R class, F class) pair.
# - aggfunc='median' is the string form pandas expects instead of np.median.
# - np.trunc keeps the original "drop the fractional part" display while
#   tolerating an empty (NaN) cell, on which applymap(int) would raise.
rfm_viz = np.trunc(
    rfm_segmentation.pivot_table(
        index='R_quantile',
        columns='F_quantile',
        values='monetary_value',
        aggfunc='median',
    )
)

fig, ax = plt.subplots()
sns.heatmap(
    rfm_viz,
    cmap="YlGnBu",
    annot=True,
    fmt=".0f",
    linewidths=4.15,
    annot_kws={"size": 10},
    # An integer here means "label every n-th row"; the previous value 4 left
    # three of the four R classes without a tick label.
    yticklabels=True,
    ax=ax,
)
ax.set(
    title='Median monetary value by R and F class',
    xlabel='F class (1 = highest purchase count)',
    ylabel='R class (1 = most recent purchase)',
);