Commit

gsdc

AlvinAi96 authored Aug 8, 2021
1 parent fea358f commit 1bd5e4f
Showing 8 changed files with 2,689 additions and 0 deletions.
464 changes: 464 additions & 0 deletions get_carpark_index_by_lgb.py

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions get_carpark_index_by_mp.py
@@ -0,0 +1,242 @@
# -*- coding: utf-8 -*-

'''
get_carpark_index_by_mp.py
author: alvin
create date: 20210801
Function: Get the car parks' indexes with the movingpandas library.
History:
    version    contributor    comment
    v1.0       alvin          first version
'''



import pandas as pd
from pathlib import Path
from shapely.geometry import Point
import geopandas as gpd
import movingpandas as mpd
from pyproj import CRS
from datetime import timedelta
import warnings
warnings.filterwarnings("ignore")



# Load the dataset.
data_dir = Path("../data")
trn_df = pd.read_csv(data_dir / "baseline_locations_train.csv")
tst_df = pd.read_csv(data_dir / "baseline_locations_test.csv")
sub_df = pd.read_csv(data_dir / 'sample_submission.csv')

# Gather the ground truth for every (collection, phone) pair in the train set.
gt_df = pd.DataFrame()
for (collection_name, phone_name), df in trn_df.groupby(["collectionName", "phoneName"]):
    path = data_dir / f"train/{collection_name}/{phone_name}/ground_truth.csv"
    df = pd.read_csv(path)
    gt_df = pd.concat([gt_df, df]).reset_index(drop=True)
gt_df['phone'] = gt_df['collectionName'] + '_' + gt_df['phoneName']



def get_traj_collection(data):
    '''Transform the dataset into a movingpandas TrajectoryCollection object.'''
    # Convert millisSinceGpsEpoch into a datetime (the GPS epoch starts at 1980-01-06).
    dt_offset = pd.to_datetime('1980-01-06 00:00:00')
    dt_offset_in_ms = int(dt_offset.value / 1e6)
    data['t'] = pd.to_datetime(data['millisSinceGpsEpoch'] + dt_offset_in_ms, unit='ms')

    # Convert latDeg & lngDeg into Point objects.
    data['geometry'] = [Point(lng, lat) for lng, lat in zip(data['lngDeg'].to_list(), data['latDeg'].to_list())]
    # Create a GeoDataFrame. Note the CRS: EPSG 4326 (WGS84).
    geodata = gpd.GeoDataFrame(data, crs=CRS.from_epsg('4326'))
    # Set the timestamp as the index.
    geodata = geodata.set_index('t')
    # Create the TrajectoryCollection object, with 'phone' as the trajectory id.
    traj_collection = mpd.TrajectoryCollection(geodata, 'phone')
    return traj_collection
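
# Illustrative sketch: build a tiny TrajectoryCollection from an invented
# three-point track to sanity-check the conversion above. All values here
# (phone id, coordinates, timestamps) are made up.
_toy = pd.DataFrame({
    'phone': ['demo_phone'] * 3,
    'latDeg': [37.4200, 37.4210, 37.4220],
    'lngDeg': [-122.0800, -122.0810, -122.0820],
})
_toy['geometry'] = [Point(lng, lat) for lng, lat in zip(_toy['lngDeg'], _toy['latDeg'])]
_toy.index = pd.to_datetime(['2021-01-01 00:00:00', '2021-01-01 00:00:01', '2021-01-01 00:00:02'])
_toy_tc = mpd.TrajectoryCollection(gpd.GeoDataFrame(_toy, crs=CRS.from_epsg('4326')), 'phone')
print(len(_toy_tc.trajectories))  # expected: 1 (one phone -> one trajectory)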



def get_stop_traj(traj_col, min_sec, max_dist):
    '''
    Run stop detection on each trajectory.
    Input:
        1. traj_col (TrajectoryCollection): the Trajectory Collection object holding multiple trajectories.
        2. min_sec (float): the minimum stop duration (s); larger values demand longer stops.
        3. max_dist (float): the maximum stop diameter (m); smaller values demand tighter stops.
    Output:
        1. stop_traj_dict (dict): the car park index dict,
           {phone: [start_point_max_idx, end_point_min_idx]}.
    '''
    traj_num = len(traj_col.trajectories)
    print('The number of trajectories: {}'.format(traj_num))
    stop_traj_dict = {}
    for i in range(traj_num):
        tgt_traj = traj_col.trajectories[i]
        tgt_stop = mpd.TrajectoryStopDetector(tgt_traj).get_stop_segments(
            min_duration=timedelta(seconds=min_sec), max_diameter=max_dist)
        # The first stop segment covers the departure car park and the last one the
        # arrival car park; record the index bounds of the moving part in between.
        start_point_max_idx = len(tgt_stop.trajectories[0].df)
        end_point_min_idx = len(tgt_traj.df) - len(tgt_stop.trajectories[-1].df)
        print("'{}':[{},{}],".format(tgt_traj.id, start_point_max_idx, end_point_min_idx))
        stop_traj_dict[tgt_traj.id] = [start_point_max_idx, end_point_min_idx]
    return stop_traj_dict



# Stop-detection hyperparameters: minimum stop duration (s) and maximum stop diameter (m).
min_sec = 1
max_dist = 33

traj_col_trn = get_traj_collection(trn_df)
stop_traj_dict_trn = get_stop_traj(traj_col_trn, min_sec, max_dist)

traj_col_tst = get_traj_collection(tst_df)
stop_traj_dict_tst = get_stop_traj(traj_col_tst, min_sec, max_dist)
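
# Illustrative sketch: cross-check the detected stops on the first train
# trajectory with get_stop_points, which summarises each stop as one row
# (stop centroid plus start/end times, per the movingpandas docs).
_detector = mpd.TrajectoryStopDetector(traj_col_trn.trajectories[0])
_stops = _detector.get_stop_points(min_duration=timedelta(seconds=min_sec),
                                   max_diameter=max_dist)
print(_stops.head())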

print('Train:', stop_traj_dict_trn)
print('Test:', stop_traj_dict_tst)




# Output (each printed line is phone_id:[index where the initial stop ends, index where the final stop begins]):
'''
The number of trajectories: 73
'2020-05-14-US-MTV-1_Pixel4':[79,1688],
'2020-05-14-US-MTV-1_Pixel4XLModded':[76,1739],
'2020-05-14-US-MTV-2_Pixel4':[97,1718],
'2020-05-14-US-MTV-2_Pixel4XLModded':[2,575],
'2020-05-21-US-MTV-1_Pixel4':[318,1922],
'2020-05-21-US-MTV-2_Pixel4':[47,1857],
'2020-05-21-US-MTV-2_Pixel4XL':[49,1790],
'2020-05-29-US-MTV-1_Pixel4':[17,1897],
'2020-05-29-US-MTV-1_Pixel4XL':[49,1894],
'2020-05-29-US-MTV-1_Pixel4XLModded':[8,1894],
'2020-05-29-US-MTV-2_Pixel4':[81,1932],
'2020-05-29-US-MTV-2_Pixel4XL':[83,1936],
'2020-06-04-US-MTV-1_Pixel4':[54,1682],
'2020-06-04-US-MTV-1_Pixel4XL':[58,1681],
'2020-06-04-US-MTV-1_Pixel4XLModded':[58,1767],
'2020-06-05-US-MTV-1_Pixel4':[55,1815],
'2020-06-05-US-MTV-1_Pixel4XL':[118,1880],
'2020-06-05-US-MTV-1_Pixel4XLModded':[190,1121],
'2020-06-05-US-MTV-2_Pixel4':[187,1672],
'2020-06-05-US-MTV-2_Pixel4XL':[18,1611],
'2020-06-11-US-MTV-1_Pixel4':[143,1888],
'2020-06-11-US-MTV-1_Pixel4XL':[154,1772],
'2020-07-08-US-MTV-1_Pixel4':[119,2038],
'2020-07-08-US-MTV-1_Pixel4XL':[121,1868],
'2020-07-08-US-MTV-1_Pixel4XLModded':[121,1244],
'2020-07-17-US-MTV-1_Mi8':[134,2038],
'2020-07-17-US-MTV-2_Mi8':[53,1712],
'2020-08-03-US-MTV-1_Mi8':[241,1935],
'2020-08-03-US-MTV-1_Pixel4':[262,1872],
'2020-08-06-US-MTV-2_Mi8':[66,1717],
'2020-08-06-US-MTV-2_Pixel4':[69,1720],
'2020-08-06-US-MTV-2_Pixel4XL':[70,1723],
'2020-09-04-US-SF-1_Mi8':[83,1714],
'2020-09-04-US-SF-1_Pixel4':[9,1743],
'2020-09-04-US-SF-1_Pixel4XL':[85,1722],
'2020-09-04-US-SF-2_Mi8':[62,2479],
'2020-09-04-US-SF-2_Pixel4':[65,2347],
'2020-09-04-US-SF-2_Pixel4XL':[64,1257],
'2021-01-04-US-RWC-1_Pixel4':[63,2010],
'2021-01-04-US-RWC-1_Pixel4Modded':[62,2010],
'2021-01-04-US-RWC-1_Pixel4XL':[62,2039],
'2021-01-04-US-RWC-1_Pixel5':[65,2009],
'2021-01-04-US-RWC-2_Pixel4':[37,1851],
'2021-01-04-US-RWC-2_Pixel4Modded':[34,1849],
'2021-01-04-US-RWC-2_Pixel4XL':[31,1846],
'2021-01-04-US-RWC-2_Pixel5':[39,1862],
'2021-01-05-US-SVL-1_Mi8':[58,1323],
'2021-01-05-US-SVL-1_Pixel4':[57,1318],
'2021-01-05-US-SVL-1_Pixel4XL':[44,1346],
'2021-01-05-US-SVL-1_Pixel5':[50,1424],
'2021-01-05-US-SVL-2_Pixel4':[45,1146],
'2021-01-05-US-SVL-2_Pixel4Modded':[40,1240],
'2021-01-05-US-SVL-2_Pixel4XL':[17,1135],
'2021-03-10-US-SVL-1_Pixel4XL':[56,1444],
'2021-03-10-US-SVL-1_SamsungS20Ultra':[62,1449],
'2021-04-15-US-MTV-1_Pixel4':[36,1685],
'2021-04-15-US-MTV-1_Pixel4Modded':[41,1682],
'2021-04-15-US-MTV-1_Pixel5':[36,1673],
'2021-04-15-US-MTV-1_SamsungS20Ultra':[38,1683],
'2021-04-22-US-SJC-1_Pixel4':[35,2886],
'2021-04-22-US-SJC-1_SamsungS20Ultra':[34,2811],
'2021-04-26-US-SVL-1_Mi8':[74,1033],
'2021-04-26-US-SVL-1_Pixel5':[72,1031],
'2021-04-28-US-MTV-1_Pixel4':[71,1973],
'2021-04-28-US-MTV-1_Pixel5':[56,1971],
'2021-04-28-US-MTV-1_SamsungS20Ultra':[52,1941],
'2021-04-28-US-SJC-1_Pixel4':[48,1984],
'2021-04-28-US-SJC-1_SamsungS20Ultra':[48,2001],
'2021-04-29-US-MTV-1_Pixel4':[96,1599],
'2021-04-29-US-MTV-1_Pixel5':[15,1587],
'2021-04-29-US-MTV-1_SamsungS20Ultra':[94,1584],
'2021-04-29-US-SJC-2_Pixel4':[28,2316],
'2021-04-29-US-SJC-2_SamsungS20Ultra':[33,2312],
The number of trajectories: 48
'2020-05-15-US-MTV-1_Pixel4':[25,3482],
'2020-05-15-US-MTV-1_Pixel4XL':[957,3498],
'2020-05-28-US-MTV-1_Pixel4':[238,2093],
'2020-05-28-US-MTV-1_Pixel4XL':[181,2095],
'2020-05-28-US-MTV-2_Pixel4':[3,2282],
'2020-05-28-US-MTV-2_Pixel4XL':[4,2214],
'2020-05-28-US-MTV-2_Pixel4XLModded':[2,1456],
'2020-06-04-US-MTV-2_Pixel4':[38,1651],
'2020-06-04-US-MTV-2_Pixel4XL':[43,1649],
'2020-06-04-US-MTV-2_Pixel4XLModded':[39,1661],
'2020-06-10-US-MTV-1_Pixel4':[97,1625],
'2020-06-10-US-MTV-1_Pixel4XL':[98,1624],
'2020-06-10-US-MTV-1_Pixel4XLModded':[95,1631],
'2020-06-10-US-MTV-2_Pixel4':[81,1779],
'2020-06-10-US-MTV-2_Pixel4XL':[83,1770],
'2020-06-10-US-MTV-2_Pixel4XLModded':[22,1930],
'2020-08-03-US-MTV-2_Mi8':[101,1701],
'2020-08-03-US-MTV-2_Pixel4':[103,1694],
'2020-08-03-US-MTV-2_Pixel4XL':[56,1647],
'2020-08-13-US-MTV-1_Mi8':[84,2195],
'2020-08-13-US-MTV-1_Pixel4':[86,2236],
'2021-03-16-US-MTV-2_Pixel4Modded':[10,2027],
'2021-03-16-US-MTV-2_SamsungS20Ultra':[156,2195],
'2021-03-16-US-RWC-2_Pixel4XL':[25,1948],
'2021-03-16-US-RWC-2_Pixel5':[74,1947],
'2021-03-16-US-RWC-2_SamsungS20Ultra':[69,1932],
'2021-03-25-US-PAO-1_Mi8':[85,1719],
'2021-03-25-US-PAO-1_Pixel4':[99,1723],
'2021-03-25-US-PAO-1_Pixel4Modded':[94,1719],
'2021-03-25-US-PAO-1_Pixel5':[96,1723],
'2021-03-25-US-PAO-1_SamsungS20Ultra':[84,1721],
'2021-04-02-US-SJC-1_Pixel4':[69,2315],
'2021-04-02-US-SJC-1_Pixel5':[72,2323],
'2021-04-08-US-MTV-1_Pixel4':[24,1007],
'2021-04-08-US-MTV-1_Pixel4Modded':[48,1005],
'2021-04-08-US-MTV-1_Pixel5':[49,1148],
'2021-04-08-US-MTV-1_SamsungS20Ultra':[48,1008],
'2021-04-21-US-MTV-1_Pixel4':[65,1420],
'2021-04-21-US-MTV-1_Pixel4Modded':[51,1406],
'2021-04-22-US-SJC-2_SamsungS20Ultra':[23,2293],
'2021-04-26-US-SVL-2_SamsungS20Ultra':[32,2301],
'2021-04-28-US-MTV-2_Pixel4':[28,1727],
'2021-04-28-US-MTV-2_SamsungS20Ultra':[49,1751],
'2021-04-29-US-MTV-2_Pixel4':[18,1679],
'2021-04-29-US-MTV-2_Pixel5':[17,1719],
'2021-04-29-US-MTV-2_SamsungS20Ultra':[126,1682],
'2021-04-29-US-SJC-3_Pixel4':[37,1947],
'2021-04-29-US-SJC-3_SamsungS20Ultra':[36,1952],
'''
145 changes: 145 additions & 0 deletions gf_mean.py
@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-

'''
gf_mean.py
author: alvin
create date: 20210719
Function: Gaussian Filter + Phone Mean.
History:
    version    contributor    comment
    v1.0       alvin          first version
Reference:
    1. 'Adaptive_gauss+phone_mean' (Petr B): https://www.kaggle.com/bpetrb/adaptive-gauss-phone-mean
'''

import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from scipy.interpolate import interp1d



def apply_gauss_smoothing(df, params):
    '''Smooth each phone's track with an adaptive blend of two Gaussian filters.'''
    df = df.copy()
    SZ_1 = params['sz_1']        # variance of the light smoother
    SZ_2 = params['sz_2']        # variance of the heavy smoother
    SZ_CRIT = params['sz_crit']  # variance used for the blending criterion

    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in unique_paths:
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        data = df[cond][['latDeg', 'lngDeg']].to_numpy()

        # Lightly and heavily smoothed versions of the track.
        lat_g1 = gaussian_filter1d(data[:, 0], np.sqrt(SZ_1))
        lon_g1 = gaussian_filter1d(data[:, 1], np.sqrt(SZ_1))
        lat_g2 = gaussian_filter1d(data[:, 0], np.sqrt(SZ_2))
        lon_g2 = gaussian_filter1d(data[:, 1], np.sqrt(SZ_2))

        # Step-to-step differences.
        lat_dif = data[1:, 0] - data[:-1, 0]
        lon_dif = data[1:, 1] - data[:-1, 1]

        # Criterion in [0, 1]: near 1 where consecutive steps point the same way
        # (steady motion, keep the light smoothing), near 0 where steps cancel
        # out (jitter around a standstill, use the heavy smoothing).
        lat_crit = np.append(np.abs(gaussian_filter1d(lat_dif, np.sqrt(SZ_CRIT))
                                    / (1e-9 + gaussian_filter1d(np.abs(lat_dif), np.sqrt(SZ_CRIT)))), [0])
        lon_crit = np.append(np.abs(gaussian_filter1d(lon_dif, np.sqrt(SZ_CRIT))
                                    / (1e-9 + gaussian_filter1d(np.abs(lon_dif), np.sqrt(SZ_CRIT)))), [0])

        df.loc[cond, 'latDeg'] = lat_g1 * lat_crit + lat_g2 * (1.0 - lat_crit)
        df.loc[cond, 'lngDeg'] = lon_g1 * lon_crit + lon_g2 * (1.0 - lon_crit)
    return df
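
# Illustrative sketch: run the adaptive blend on a synthetic track that moves
# in a straight line and then jitters around a standstill. The parameter values
# are placeholders, not tuned ones. Expect the moving part to stay close to the
# input while the standstill jitter is smoothed hard.
_rng = np.random.default_rng(0)
_lat = np.concatenate([np.linspace(37.4200, 37.4300, 50),
                       37.4300 + _rng.normal(0, 1e-5, 50)])
_lng = np.concatenate([np.linspace(-122.0800, -122.0700, 50),
                       -122.0700 + _rng.normal(0, 1e-5, 50)])
_demo = pd.DataFrame({'collectionName': 'demo', 'phoneName': 'demo',
                      'latDeg': _lat, 'lngDeg': _lng})
_smoothed = apply_gauss_smoothing(_demo, {'sz_1': 0.85, 'sz_2': 5.65, 'sz_crit': 1.5})
print(_smoothed[['latDeg', 'lngDeg']].tail())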



def mean_with_other_phones(df):
    '''Average each phone's track with the (interpolated) tracks of the other
    phones in the same collection.'''
    df = df.copy()
    collections_list = df[['collectionName']].drop_duplicates().to_numpy()
    # For each collection.
    for collection in collections_list:
        phone_list = df[df['collectionName'] == collection[0]][['phoneName']].drop_duplicates().to_numpy()

        phone_data = {}
        corrections = {}
        # For each phone.
        for phone in phone_list:
            # Boolean mask for this collection + phone.
            cond = np.logical_and(df['collectionName'] == collection[0], df['phoneName'] == phone[0]).to_list()
            phone_data[phone[0]] = df[cond][['millisSinceGpsEpoch', 'latDeg', 'lngDeg']].to_numpy()

        # Take one phone at a time as the current track.
        for current in phone_data:
            # Column 0 counts contributing tracks; columns 1: accumulate coordinates.
            correction = np.ones(phone_data[current].shape, dtype=float)
            correction[:, 1:] = phone_data[current][:, 1:]

            # The phones' timestamps don't match exactly, so interpolate.
            for other in phone_data:
                if other == current:
                    continue
                # Interpolate the other phone's track:
                # x is the timestamp, y is the (lat, lng) pair.
                loc = interp1d(phone_data[other][:, 0],
                               phone_data[other][:, 1:],
                               axis=0,
                               kind='linear',
                               copy=False,
                               bounds_error=None,
                               fill_value='extrapolate',
                               assume_sorted=True)
                # Find the span of the current track that overlaps the other track in time.
                start_idx = 0
                stop_idx = 0
                for idx, val in enumerate(phone_data[current][:, 0]):
                    if val < phone_data[other][0, 0]:
                        start_idx = idx
                    if val < phone_data[other][-1, 0]:
                        stop_idx = idx

                if stop_idx - start_idx > 0:
                    correction[start_idx:stop_idx, 0] += 1
                    correction[start_idx:stop_idx, 1:] += loc(phone_data[current][start_idx:stop_idx, 0])
            # Average the current track with the other phones' tracks.
            correction[:, 1] /= correction[:, 0]
            correction[:, 2] /= correction[:, 0]
            corrections[current] = correction.copy()

        for phone in phone_list:
            cond = np.logical_and(df['collectionName'] == collection[0], df['phoneName'] == phone[0]).to_list()
            df.loc[cond, ['latDeg', 'lngDeg']] = corrections[phone[0]][:, 1:]
    return df



def calc_haversine(lat1, lon1, lat2, lon2):
    '''Great-circle (haversine) distance in metres between two lat/lng points.'''
    RADIUS = 6_367_000  # approximate Earth radius in metres
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist
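
# Illustrative sanity check: two invented points ~0.0009 deg of latitude apart
# should be roughly 100 m apart (1 deg of latitude is about 111 km).
print(calc_haversine(37.4200, -122.0800, 37.4209, -122.0800))  # ~100 m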



def compute_dist(pred_df, gt_df):
    '''Competition-style score: mean of the per-phone 50th and 95th percentile
    haversine errors, plus a per-phone breakdown table.'''
    oof = pred_df.copy()
    gt = gt_df.copy()
    df = oof.merge(gt, on=['phone', 'millisSinceGpsEpoch'])
    dst_oof = calc_haversine(df.latDeg_x, df.lngDeg_x, df.latDeg_y, df.lngDeg_y)
    scores = pd.DataFrame({'phone': df.phone, 'dst': dst_oof})
    scores_grp = scores.groupby('phone')
    d50 = scores_grp.quantile(.50).reset_index()
    d50.columns = ['phone', 'q50']
    d95 = scores_grp.quantile(.95).reset_index()
    d95.columns = ['phone', 'q95']
    return (scores_grp.quantile(.50).mean() + scores_grp.quantile(.95).mean()) / 2, d50.merge(d95)
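
# Illustrative end-to-end sketch: smooth, average across phones, then score
# against ground truth. Assumes trn_df and gt_df are loaded as in
# get_carpark_index_by_mp.py; the smoothing parameters are placeholders.
trn_df['phone'] = trn_df['collectionName'] + '_' + trn_df['phoneName']
smoothed_df = apply_gauss_smoothing(trn_df, {'sz_1': 0.85, 'sz_2': 5.65, 'sz_crit': 1.5})
averaged_df = mean_with_other_phones(smoothed_df)
score, per_phone = compute_dist(averaged_df, gt_df)
print(score)
print(per_phone.head())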


