Commit

gsdc

AlvinAi96 authored Aug 8, 2021
1 parent fea358f commit 1bd5e4f
Showing 8 changed files with 2,689 additions and 0 deletions.
464 changes: 464 additions & 0 deletions get_carpark_index_by_lgb.py

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions get_carpark_index_by_mp.py
@@ -0,0 +1,242 @@
# -*- coding: utf-8 -*-

'''
get_carpark_index_by_mp.py
author: alvin
create date: 20210801
Function: Get the car parks' indexes with the movingpandas library.
History:
    version    contributor    comment
    v1.0       alvin          first version
'''



import pandas as pd
from pathlib import Path
from shapely.geometry import Point
import geopandas as gpd
import movingpandas as mpd
from pyproj import CRS
from datetime import timedelta
import warnings
warnings.filterwarnings("ignore")



# Load the dataset.
data_dir = Path("../data")
trn_df = pd.read_csv(data_dir / "baseline_locations_train.csv")
tst_df = pd.read_csv(data_dir / "baseline_locations_test.csv")
sub_df = pd.read_csv(data_dir / 'sample_submission.csv')

# Gather the ground truth for every (collection, phone) pair in the train set.
gt_df = pd.DataFrame()
for (collection_name, phone_name), df in trn_df.groupby(["collectionName", "phoneName"]):
    path = data_dir / f"train/{collection_name}/{phone_name}/ground_truth.csv"
    df = pd.read_csv(path)
    gt_df = pd.concat([gt_df, df]).reset_index(drop=True)
gt_df['phone'] = gt_df['collectionName'] + '_' + gt_df['phoneName']



def get_traj_collection(data):
    '''Transform the dataset into a movingpandas TrajectoryCollection object.'''
    # Convert millisSinceGpsEpoch into a datetime (the GPS epoch starts at 1980-01-06).
    dt_offset = pd.to_datetime('1980-01-06 00:00:00')
    dt_offset_in_ms = int(dt_offset.value / 1e6)
    data['t'] = pd.to_datetime(data['millisSinceGpsEpoch'] + dt_offset_in_ms, unit='ms')

    # Convert latDeg & lngDeg into Point objects.
    data['geometry'] = [Point(lng, lat) for lng, lat in zip(data['lngDeg'].to_list(), data['latDeg'].to_list())]
    # Create a GeoDataFrame. Note the CRS: EPSG 4326 (WGS84).
    geodata = gpd.GeoDataFrame(data, crs=CRS.from_epsg('4326'))
    # Set the timestamp as the index.
    geodata = geodata.set_index('t')
    # Create the TrajectoryCollection object, with 'phone' as the trajectory id.
    traj_collection = mpd.TrajectoryCollection(geodata, 'phone')
    return traj_collection
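
# Illustrative sketch: build a tiny TrajectoryCollection from an invented
# three-point track to sanity-check the conversion above. All values here
# (phone id, coordinates, timestamps) are made up.
_toy = pd.DataFrame({
    'phone': ['demo_phone'] * 3,
    'latDeg': [37.4200, 37.4210, 37.4220],
    'lngDeg': [-122.0800, -122.0810, -122.0820],
})
_toy['geometry'] = [Point(lng, lat) for lng, lat in zip(_toy['lngDeg'], _toy['latDeg'])]
_toy.index = pd.to_datetime(['2021-01-01 00:00:00', '2021-01-01 00:00:01', '2021-01-01 00:00:02'])
_toy_tc = mpd.TrajectoryCollection(gpd.GeoDataFrame(_toy, crs=CRS.from_epsg('4326')), 'phone')
print(len(_toy_tc.trajectories))  # expected: 1 (one phone -> one trajectory)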



def get_stop_traj(traj_col, min_sec, max_dist):
    '''
    Run stop detection on each trajectory.
    Input:
        1. traj_col (TrajectoryCollection): the Trajectory Collection object holding multiple trajectories.
        2. min_sec (float): the minimum stop duration (s); larger values demand longer stops.
        3. max_dist (float): the maximum stop diameter (m); smaller values demand tighter stops.
    Output:
        1. stop_traj_dict (dict): the car park index dict,
           {phone: [start_point_max_idx, end_point_min_idx]}.
    '''
    traj_num = len(traj_col.trajectories)
    print('The number of trajectories: {}'.format(traj_num))
    stop_traj_dict = {}
    for i in range(traj_num):
        tgt_traj = traj_col.trajectories[i]
        tgt_stop = mpd.TrajectoryStopDetector(tgt_traj).get_stop_segments(
            min_duration=timedelta(seconds=min_sec), max_diameter=max_dist)
        # The first stop segment covers the departure car park and the last one the
        # arrival car park; record the index bounds of the moving part in between.
        start_point_max_idx = len(tgt_stop.trajectories[0].df)
        end_point_min_idx = len(tgt_traj.df) - len(tgt_stop.trajectories[-1].df)
        print("'{}':[{},{}],".format(tgt_traj.id, start_point_max_idx, end_point_min_idx))
        stop_traj_dict[tgt_traj.id] = [start_point_max_idx, end_point_min_idx]
    return stop_traj_dict



# Stop-detection hyperparameters: minimum stop duration (s) and maximum stop diameter (m).
min_sec = 1
max_dist = 33

traj_col_trn = get_traj_collection(trn_df)
stop_traj_dict_trn = get_stop_traj(traj_col_trn, min_sec, max_dist)

traj_col_tst = get_traj_collection(tst_df)
stop_traj_dict_tst = get_stop_traj(traj_col_tst, min_sec, max_dist)
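
# Illustrative sketch: cross-check the detected stops on the first train
# trajectory with get_stop_points, which summarises each stop as one row
# (stop centroid plus start/end times, per the movingpandas docs).
_detector = mpd.TrajectoryStopDetector(traj_col_trn.trajectories[0])
_stops = _detector.get_stop_points(min_duration=timedelta(seconds=min_sec),
                                   max_diameter=max_dist)
print(_stops.head())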

print('Train:', stop_traj_dict_trn)
print('Test:', stop_traj_dict_tst)




# Output (each printed line is phone_id:[index where the initial stop ends, index where the final stop begins]):
'''
The number of trajectories: 73
'2020-05-14-US-MTV-1_Pixel4':[79,1688],
'2020-05-14-US-MTV-1_Pixel4XLModded':[76,1739],
'2020-05-14-US-MTV-2_Pixel4':[97,1718],
'2020-05-14-US-MTV-2_Pixel4XLModded':[2,575],
'2020-05-21-US-MTV-1_Pixel4':[318,1922],
'2020-05-21-US-MTV-2_Pixel4':[47,1857],
'2020-05-21-US-MTV-2_Pixel4XL':[49,1790],
'2020-05-29-US-MTV-1_Pixel4':[17,1897],
'2020-05-29-US-MTV-1_Pixel4XL':[49,1894],
'2020-05-29-US-MTV-1_Pixel4XLModded':[8,1894],
'2020-05-29-US-MTV-2_Pixel4':[81,1932],
'2020-05-29-US-MTV-2_Pixel4XL':[83,1936],
'2020-06-04-US-MTV-1_Pixel4':[54,1682],
'2020-06-04-US-MTV-1_Pixel4XL':[58,1681],
'2020-06-04-US-MTV-1_Pixel4XLModded':[58,1767],
'2020-06-05-US-MTV-1_Pixel4':[55,1815],
'2020-06-05-US-MTV-1_Pixel4XL':[118,1880],
'2020-06-05-US-MTV-1_Pixel4XLModded':[190,1121],
'2020-06-05-US-MTV-2_Pixel4':[187,1672],
'2020-06-05-US-MTV-2_Pixel4XL':[18,1611],
'2020-06-11-US-MTV-1_Pixel4':[143,1888],
'2020-06-11-US-MTV-1_Pixel4XL':[154,1772],
'2020-07-08-US-MTV-1_Pixel4':[119,2038],
'2020-07-08-US-MTV-1_Pixel4XL':[121,1868],
'2020-07-08-US-MTV-1_Pixel4XLModded':[121,1244],
'2020-07-17-US-MTV-1_Mi8':[134,2038],
'2020-07-17-US-MTV-2_Mi8':[53,1712],
'2020-08-03-US-MTV-1_Mi8':[241,1935],
'2020-08-03-US-MTV-1_Pixel4':[262,1872],
'2020-08-06-US-MTV-2_Mi8':[66,1717],
'2020-08-06-US-MTV-2_Pixel4':[69,1720],
'2020-08-06-US-MTV-2_Pixel4XL':[70,1723],
'2020-09-04-US-SF-1_Mi8':[83,1714],
'2020-09-04-US-SF-1_Pixel4':[9,1743],
'2020-09-04-US-SF-1_Pixel4XL':[85,1722],
'2020-09-04-US-SF-2_Mi8':[62,2479],
'2020-09-04-US-SF-2_Pixel4':[65,2347],
'2020-09-04-US-SF-2_Pixel4XL':[64,1257],
'2021-01-04-US-RWC-1_Pixel4':[63,2010],
'2021-01-04-US-RWC-1_Pixel4Modded':[62,2010],
'2021-01-04-US-RWC-1_Pixel4XL':[62,2039],
'2021-01-04-US-RWC-1_Pixel5':[65,2009],
'2021-01-04-US-RWC-2_Pixel4':[37,1851],
'2021-01-04-US-RWC-2_Pixel4Modded':[34,1849],
'2021-01-04-US-RWC-2_Pixel4XL':[31,1846],
'2021-01-04-US-RWC-2_Pixel5':[39,1862],
'2021-01-05-US-SVL-1_Mi8':[58,1323],
'2021-01-05-US-SVL-1_Pixel4':[57,1318],
'2021-01-05-US-SVL-1_Pixel4XL':[44,1346],
'2021-01-05-US-SVL-1_Pixel5':[50,1424],
'2021-01-05-US-SVL-2_Pixel4':[45,1146],
'2021-01-05-US-SVL-2_Pixel4Modded':[40,1240],
'2021-01-05-US-SVL-2_Pixel4XL':[17,1135],
'2021-03-10-US-SVL-1_Pixel4XL':[56,1444],
'2021-03-10-US-SVL-1_SamsungS20Ultra':[62,1449],
'2021-04-15-US-MTV-1_Pixel4':[36,1685],
'2021-04-15-US-MTV-1_Pixel4Modded':[41,1682],
'2021-04-15-US-MTV-1_Pixel5':[36,1673],
'2021-04-15-US-MTV-1_SamsungS20Ultra':[38,1683],
'2021-04-22-US-SJC-1_Pixel4':[35,2886],
'2021-04-22-US-SJC-1_SamsungS20Ultra':[34,2811],
'2021-04-26-US-SVL-1_Mi8':[74,1033],
'2021-04-26-US-SVL-1_Pixel5':[72,1031],
'2021-04-28-US-MTV-1_Pixel4':[71,1973],
'2021-04-28-US-MTV-1_Pixel5':[56,1971],
'2021-04-28-US-MTV-1_SamsungS20Ultra':[52,1941],
'2021-04-28-US-SJC-1_Pixel4':[48,1984],
'2021-04-28-US-SJC-1_SamsungS20Ultra':[48,2001],
'2021-04-29-US-MTV-1_Pixel4':[96,1599],
'2021-04-29-US-MTV-1_Pixel5':[15,1587],
'2021-04-29-US-MTV-1_SamsungS20Ultra':[94,1584],
'2021-04-29-US-SJC-2_Pixel4':[28,2316],
'2021-04-29-US-SJC-2_SamsungS20Ultra':[33,2312],
The number of trajectories: 48
'2020-05-15-US-MTV-1_Pixel4':[25,3482],
'2020-05-15-US-MTV-1_Pixel4XL':[957,3498],
'2020-05-28-US-MTV-1_Pixel4':[238,2093],
'2020-05-28-US-MTV-1_Pixel4XL':[181,2095],
'2020-05-28-US-MTV-2_Pixel4':[3,2282],
'2020-05-28-US-MTV-2_Pixel4XL':[4,2214],
'2020-05-28-US-MTV-2_Pixel4XLModded':[2,1456],
'2020-06-04-US-MTV-2_Pixel4':[38,1651],
'2020-06-04-US-MTV-2_Pixel4XL':[43,1649],
'2020-06-04-US-MTV-2_Pixel4XLModded':[39,1661],
'2020-06-10-US-MTV-1_Pixel4':[97,1625],
'2020-06-10-US-MTV-1_Pixel4XL':[98,1624],
'2020-06-10-US-MTV-1_Pixel4XLModded':[95,1631],
'2020-06-10-US-MTV-2_Pixel4':[81,1779],
'2020-06-10-US-MTV-2_Pixel4XL':[83,1770],
'2020-06-10-US-MTV-2_Pixel4XLModded':[22,1930],
'2020-08-03-US-MTV-2_Mi8':[101,1701],
'2020-08-03-US-MTV-2_Pixel4':[103,1694],
'2020-08-03-US-MTV-2_Pixel4XL':[56,1647],
'2020-08-13-US-MTV-1_Mi8':[84,2195],
'2020-08-13-US-MTV-1_Pixel4':[86,2236],
'2021-03-16-US-MTV-2_Pixel4Modded':[10,2027],
'2021-03-16-US-MTV-2_SamsungS20Ultra':[156,2195],
'2021-03-16-US-RWC-2_Pixel4XL':[25,1948],
'2021-03-16-US-RWC-2_Pixel5':[74,1947],
'2021-03-16-US-RWC-2_SamsungS20Ultra':[69,1932],
'2021-03-25-US-PAO-1_Mi8':[85,1719],
'2021-03-25-US-PAO-1_Pixel4':[99,1723],
'2021-03-25-US-PAO-1_Pixel4Modded':[94,1719],
'2021-03-25-US-PAO-1_Pixel5':[96,1723],
'2021-03-25-US-PAO-1_SamsungS20Ultra':[84,1721],
'2021-04-02-US-SJC-1_Pixel4':[69,2315],
'2021-04-02-US-SJC-1_Pixel5':[72,2323],
'2021-04-08-US-MTV-1_Pixel4':[24,1007],
'2021-04-08-US-MTV-1_Pixel4Modded':[48,1005],
'2021-04-08-US-MTV-1_Pixel5':[49,1148],
'2021-04-08-US-MTV-1_SamsungS20Ultra':[48,1008],
'2021-04-21-US-MTV-1_Pixel4':[65,1420],
'2021-04-21-US-MTV-1_Pixel4Modded':[51,1406],
'2021-04-22-US-SJC-2_SamsungS20Ultra':[23,2293],
'2021-04-26-US-SVL-2_SamsungS20Ultra':[32,2301],
'2021-04-28-US-MTV-2_Pixel4':[28,1727],
'2021-04-28-US-MTV-2_SamsungS20Ultra':[49,1751],
'2021-04-29-US-MTV-2_Pixel4':[18,1679],
'2021-04-29-US-MTV-2_Pixel5':[17,1719],
'2021-04-29-US-MTV-2_SamsungS20Ultra':[126,1682],
'2021-04-29-US-SJC-3_Pixel4':[37,1947],
'2021-04-29-US-SJC-3_SamsungS20Ultra':[36,1952],
'''
145 changes: 145 additions & 0 deletions gf_mean.py
@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-

'''
gf_mean.py
author: alvin
create date: 20210719
Function: Gaussian Filter + Phone Mean.
History:
    version    contributor    comment
    v1.0       alvin          first version
Reference:
    1. 'Adaptive_gauss+phone_mean' (Petr B): https://www.kaggle.com/bpetrb/adaptive-gauss-phone-mean
'''

import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from scipy.interpolate import interp1d



def apply_gauss_smoothing(df, params):
    '''Smooth each phone's track with an adaptive blend of two Gaussian filters.'''
    df = df.copy()
    SZ_1 = params['sz_1']        # variance of the light smoother
    SZ_2 = params['sz_2']        # variance of the heavy smoother
    SZ_CRIT = params['sz_crit']  # variance used for the blending criterion

    unique_paths = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in unique_paths:
        cond = np.logical_and(df['collectionName'] == collection, df['phoneName'] == phone)
        data = df[cond][['latDeg', 'lngDeg']].to_numpy()

        # Lightly and heavily smoothed versions of the track.
        lat_g1 = gaussian_filter1d(data[:, 0], np.sqrt(SZ_1))
        lon_g1 = gaussian_filter1d(data[:, 1], np.sqrt(SZ_1))
        lat_g2 = gaussian_filter1d(data[:, 0], np.sqrt(SZ_2))
        lon_g2 = gaussian_filter1d(data[:, 1], np.sqrt(SZ_2))

        # Step-to-step differences.
        lat_dif = data[1:, 0] - data[:-1, 0]
        lon_dif = data[1:, 1] - data[:-1, 1]

        # Criterion in [0, 1]: near 1 where consecutive steps point the same way
        # (steady motion, keep the light smoothing), near 0 where steps cancel
        # out (jitter around a standstill, use the heavy smoothing).
        lat_crit = np.append(np.abs(gaussian_filter1d(lat_dif, np.sqrt(SZ_CRIT))
                                    / (1e-9 + gaussian_filter1d(np.abs(lat_dif), np.sqrt(SZ_CRIT)))), [0])
        lon_crit = np.append(np.abs(gaussian_filter1d(lon_dif, np.sqrt(SZ_CRIT))
                                    / (1e-9 + gaussian_filter1d(np.abs(lon_dif), np.sqrt(SZ_CRIT)))), [0])

        df.loc[cond, 'latDeg'] = lat_g1 * lat_crit + lat_g2 * (1.0 - lat_crit)
        df.loc[cond, 'lngDeg'] = lon_g1 * lon_crit + lon_g2 * (1.0 - lon_crit)
    return df
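
# Illustrative sketch: run the adaptive blend on a synthetic track that moves
# in a straight line and then jitters around a standstill. The parameter values
# are placeholders, not tuned ones. Expect the moving part to stay close to the
# input while the standstill jitter is smoothed hard.
_rng = np.random.default_rng(0)
_lat = np.concatenate([np.linspace(37.4200, 37.4300, 50),
                       37.4300 + _rng.normal(0, 1e-5, 50)])
_lng = np.concatenate([np.linspace(-122.0800, -122.0700, 50),
                       -122.0700 + _rng.normal(0, 1e-5, 50)])
_demo = pd.DataFrame({'collectionName': 'demo', 'phoneName': 'demo',
                      'latDeg': _lat, 'lngDeg': _lng})
_smoothed = apply_gauss_smoothing(_demo, {'sz_1': 0.85, 'sz_2': 5.65, 'sz_crit': 1.5})
print(_smoothed[['latDeg', 'lngDeg']].tail())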



def mean_with_other_phones(df):
    '''Average each phone's track with the (interpolated) tracks of the other
    phones in the same collection.'''
    df = df.copy()
    collections_list = df[['collectionName']].drop_duplicates().to_numpy()
    # For each collection.
    for collection in collections_list:
        phone_list = df[df['collectionName'] == collection[0]][['phoneName']].drop_duplicates().to_numpy()

        phone_data = {}
        corrections = {}
        # For each phone.
        for phone in phone_list:
            # Boolean mask for this collection + phone.
            cond = np.logical_and(df['collectionName'] == collection[0], df['phoneName'] == phone[0]).to_list()
            phone_data[phone[0]] = df[cond][['millisSinceGpsEpoch', 'latDeg', 'lngDeg']].to_numpy()

        # Take one phone at a time as the current track.
        for current in phone_data:
            # Column 0 counts contributing tracks; columns 1: accumulate coordinates.
            correction = np.ones(phone_data[current].shape, dtype=float)
            correction[:, 1:] = phone_data[current][:, 1:]

            # The phones' timestamps don't match exactly, so interpolate.
            for other in phone_data:
                if other == current:
                    continue
                # Interpolate the other phone's track:
                # x is the timestamp, y is the (lat, lng) pair.
                loc = interp1d(phone_data[other][:, 0],
                               phone_data[other][:, 1:],
                               axis=0,
                               kind='linear',
                               copy=False,
                               bounds_error=None,
                               fill_value='extrapolate',
                               assume_sorted=True)
                # Find the span of the current track that overlaps the other track in time.
                start_idx = 0
                stop_idx = 0
                for idx, val in enumerate(phone_data[current][:, 0]):
                    if val < phone_data[other][0, 0]:
                        start_idx = idx
                    if val < phone_data[other][-1, 0]:
                        stop_idx = idx

                if stop_idx - start_idx > 0:
                    correction[start_idx:stop_idx, 0] += 1
                    correction[start_idx:stop_idx, 1:] += loc(phone_data[current][start_idx:stop_idx, 0])
            # Average the current track with the other phones' tracks.
            correction[:, 1] /= correction[:, 0]
            correction[:, 2] /= correction[:, 0]
            corrections[current] = correction.copy()

        for phone in phone_list:
            cond = np.logical_and(df['collectionName'] == collection[0], df['phoneName'] == phone[0]).to_list()
            df.loc[cond, ['latDeg', 'lngDeg']] = corrections[phone[0]][:, 1:]
    return df



def calc_haversine(lat1, lon1, lat2, lon2):
    '''Great-circle (haversine) distance in metres between two lat/lng points.'''
    RADIUS = 6_367_000  # approximate Earth radius in metres
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    dist = 2 * RADIUS * np.arcsin(a**0.5)
    return dist
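
# Illustrative sanity check: two invented points ~0.0009 deg of latitude apart
# should be roughly 100 m apart (1 deg of latitude is about 111 km).
print(calc_haversine(37.4200, -122.0800, 37.4209, -122.0800))  # ~100 m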



def compute_dist(pred_df, gt_df):
    '''Competition-style score: mean of the per-phone 50th and 95th percentile
    haversine errors, plus a per-phone breakdown table.'''
    oof = pred_df.copy()
    gt = gt_df.copy()
    df = oof.merge(gt, on=['phone', 'millisSinceGpsEpoch'])
    dst_oof = calc_haversine(df.latDeg_x, df.lngDeg_x, df.latDeg_y, df.lngDeg_y)
    scores = pd.DataFrame({'phone': df.phone, 'dst': dst_oof})
    scores_grp = scores.groupby('phone')
    d50 = scores_grp.quantile(.50).reset_index()
    d50.columns = ['phone', 'q50']
    d95 = scores_grp.quantile(.95).reset_index()
    d95.columns = ['phone', 'q95']
    return (scores_grp.quantile(.50).mean() + scores_grp.quantile(.95).mean()) / 2, d50.merge(d95)
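
# Illustrative end-to-end sketch: smooth, average across phones, then score
# against ground truth. Assumes trn_df and gt_df are loaded as in
# get_carpark_index_by_mp.py; the smoothing parameters are placeholders.
trn_df['phone'] = trn_df['collectionName'] + '_' + trn_df['phoneName']
smoothed_df = apply_gauss_smoothing(trn_df, {'sz_1': 0.85, 'sz_2': 5.65, 'sz_crit': 1.5})
averaged_df = mean_with_other_phones(smoothed_df)
score, per_phone = compute_dist(averaged_df, gt_df)
print(score)
print(per_phone.head())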


