lgb_model.py
# -*- coding: utf-8 -*-
# @Time : 2018/10/15 3:23 PM
# @Author : Inf.Turing
# @Site :
# @File : lgb_baseline.py
# @Software: PyCharm
# Don't waste too much time on ground you already know well; learn to route around some obstacles.
# Steady intermediate wins are one of the best motivators for persisting.
# Put real care into the work and it will pay off.
import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
import lightgbm as lgb
import time
import pandas as pd
import numpy as np
path = 'D:/iflytek'
# full dataset: load train and test, then stack them for joint feature engineering
train_data = pd.read_csv(path + '/round1_iflyad_train.txt', sep='\t', encoding='UTF-8')
test_data = pd.read_csv(path + '/round1_iflyad_test_feature.txt', sep='\t', encoding='UTF-8')
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
data = data.fillna(-1)
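# Working on the concatenated frame keeps category codes and count features consistent
# between train and test; fillna(-1) makes "missing" an explicit category of its own.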
# data['day'] = data['time'].apply(lambda x: int(time.strftime("%d", time.localtime(x))))
# data['hour'] = data['time'].apply(lambda x: int(time.strftime("%H", time.localtime(x))))
data['day'] = [int(time.strftime("%d", time.localtime(i))) for i in data['time'].values]  # faster than apply
data['hour'] = [int(time.strftime("%H", time.localtime(i))) for i in data['time'].values]  # faster than apply
data['label'] = data.click.astype(int)
del data['click']
bool_feature = ['creative_is_jump', 'creative_is_download', 'creative_is_js', 'creative_is_voicead',
                'creative_has_deeplink', 'app_paid']
for i in bool_feature:
    data[i] = data[i].astype(int)
data['advert_industry_inner_1'] = data['advert_industry_inner'].apply(lambda x: x.split('_')[0])
ad_cate_feature = ['adid', 'advert_id', 'orderid', 'advert_industry_inner_1', 'advert_industry_inner', 'advert_name',
                   'campaign_id', 'creative_id', 'creative_type', 'creative_tp_dnf', 'creative_has_deeplink',
                   'creative_is_jump', 'creative_is_download']
media_cate_feature = ['app_cate_id', 'f_channel', 'app_id', 'inner_slot_id']
content_cate_feature = ['city', 'carrier', 'province', 'nnt', 'devtype', 'osv', 'os', 'make', 'model']
origin_cate_list = ad_cate_feature + media_cate_feature + content_cate_feature
# label-encode the categorical features as integer codes (smaller and faster downstream)
for i in origin_cate_list:
    data[i] = data[i].map(
        dict(zip(data[i].unique(), range(0, data[i].nunique()))))
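# A hypothetical example of the mapping built above: if data['os'] held the values
# ['android', 'ios', -1], the dict would be {'android': 0, 'ios': 1, -1: 2},
# i.e. codes are assigned in order of first appearance.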
count_feature_list = []


def feature_count(data, features=[], is_feature=True):
    if len(set(features)) != len(features):
        print('duplicate feature!')
        return data
    new_feature = 'count'
    nunique = []
    for i in features:
        nunique.append(data[i].nunique())
        new_feature += '_' + i.replace('add_', '')  # builds a name like count_<f1>_<f2>
    if len(features) > 1 and len(data[features].drop_duplicates()) <= np.max(nunique):
        print(new_feature, 'is an invalid cross feature')
        return data
    # temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    # data = data.merge(temp, 'left', on=features)
    # replaces the two merge lines above; select a single column so transform returns a Series
    data[new_feature] = data.groupby(features)[features[0]].transform('count')
    if is_feature:
        count_feature_list.append(new_feature)
    if 'day_' in new_feature:
        print('fix:', new_feature)
        data.loc[data.day == 3, new_feature] = data[data.day == 3][new_feature] * 4
    return data
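# feature_count adds frequency features: count_adid is how many rows share each adid,
# count_day_hour_adid counts each (day, hour, adid) combination, and so on. The
# day == 3 rescale multiplies those counts by 4, presumably because day 3 covers
# only a fraction of a full day in this dataset.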
for i in origin_cate_list:
    n = data[i].nunique()
    if n > 5:
        data = feature_count(data, [i])
        data = feature_count(data, ['day', 'hour', i])
ratio_feature_list = []
for i in ['adid']:
    for j in content_cate_feature:
        data = feature_count(data, [i, j])
        # if data[i].nunique() > 5 and data[j].nunique() > 5:
        # feature_count can skip an invalid cross feature, so also check that the count
        # column exists; without the check the lookups below would raise a KeyError
        if data[i].nunique() > 5 and data[j].nunique() > 5 and 'count_' + i + '_' + j in data.keys():
            data['ratio_' + j + '_of_' + i] = data['count_' + i + '_' + j] / data['count_' + i]
            data['ratio_' + i + '_of_' + j] = data['count_' + i + '_' + j] / data['count_' + j]
            ratio_feature_list.append('ratio_' + j + '_of_' + i)
            ratio_feature_list.append('ratio_' + i + '_of_' + j)
for i in media_cate_feature:
    for j in content_cate_feature + ad_cate_feature:
        data = feature_count(data, [i, j])
        # if data[i].nunique() > 5 and data[j].nunique() > 5:
        # same guard as above: skip pairs whose count column was never created
        if data[i].nunique() > 5 and data[j].nunique() > 5 and 'count_' + i + '_' + j in data.keys():
            data['ratio_' + j + '_of_' + i] = data['count_' + i + '_' + j] / data['count_' + i]
            data['ratio_' + i + '_of_' + j] = data['count_' + i + '_' + j] / data['count_' + j]
            ratio_feature_list.append('ratio_' + j + '_of_' + i)
            ratio_feature_list.append('ratio_' + i + '_of_' + j)
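# The ratio features are conditional frequencies: ratio_city_of_adid is
# count(adid, city) / count(adid), i.e. what share of an ad's impressions fall in a
# given city; the mirrored ratio conditions on the other column instead.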
cate_feature = origin_cate_list
num_feature = ['creative_width', 'creative_height', 'hour'] + count_feature_list + ratio_feature_list
feature_list = cate_feature + num_feature  # renamed so the loop variable `feature` below doesn't shadow it
print(len(feature_list), feature_list)
# low-frequency filtering
for feature in cate_feature:
    if 'count_' + feature in data.keys():  # data.keys() is the same as data.columns
        print(feature)
        data.loc[data['count_' + feature] < 2, feature] = -1
        data[feature] = data[feature] + 1
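# Category values seen fewer than 2 times are merged into the -1 bucket, then the +1
# shift makes all codes non-negative, since the legacy OneHotEncoder used below
# rejects negative integer inputs.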
predict = data[(data.label == -1) & (data.data_type == 2)]
predict_result = predict[['instance_id']].copy()  # .copy() avoids SettingWithCopyWarning on the assignments below
predict_result['predicted_score'] = 0
predict_x = predict.drop('label', axis=1)
train_x = data[data.label != -1].reset_index(drop=True)
train_y = train_x.pop('label').values
# empty CSR scaffolds to hstack feature blocks onto; for CSR vs CSC background see csr-csc说明.doc (Baidu Netdisk)
base_train_csr = sparse.csr_matrix((len(train_x), 0))
base_predict_csr = sparse.csr_matrix((len(predict_x), 0))
enc = OneHotEncoder()
for feature in cate_feature:
    enc.fit(data[feature].values.reshape(-1, 1))
    base_train_csr = sparse.hstack(
        (base_train_csr, enc.transform(train_x[feature].values.reshape(-1, 1))), 'csr', 'bool')
    base_predict_csr = sparse.hstack(
        (base_predict_csr, enc.transform(predict[feature].values.reshape(-1, 1))), 'csr', 'bool')
print('one-hot prepared !')
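# The encoder is fit on the full concatenated data and applied to each split
# separately, so train and predict matrices end up with identical column layouts.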
cv = CountVectorizer(min_df=20)
for feature in ['user_tags']:
    data[feature] = data[feature].astype(str)
    cv.fit(data[feature])
    base_train_csr = sparse.hstack((base_train_csr, cv.transform(train_x[feature].astype(str))), 'csr', 'bool')
    base_predict_csr = sparse.hstack((base_predict_csr, cv.transform(predict_x[feature].astype(str))), 'csr', 'bool')
print('cv prepared !')
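# user_tags appears to be a delimited tag string per row; CountVectorizer's default
# tokenizer splits it into individual tags, giving a bag-of-tags matrix, and
# min_df=20 drops tags that occur in fewer than 20 rows.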
train_csr = sparse.hstack(
    (sparse.csr_matrix(train_x[num_feature]), base_train_csr), 'csr').astype('float32')
predict_csr = sparse.hstack(
    (sparse.csr_matrix(predict_x[num_feature]), base_predict_csr), 'csr').astype('float32')
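# Final design matrix: the numeric columns (sizes, hour, counts, ratios) are stacked
# next to the sparse one-hot and tag blocks, all as float32 CSR for LightGBM.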
lgb_model = lgb.LGBMClassifier(
    boosting_type='gbdt', num_leaves=61, reg_alpha=3, reg_lambda=1,
    max_depth=-1, n_estimators=5000, objective='binary',
    subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
    learning_rate=0.035, random_state=2018, n_jobs=10)
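# n_estimators=5000 is just an upper bound: early stopping inside the CV loop below
# picks the effective number of trees per fold from the validation logloss.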
skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)
best_score = []
for index, (train_index, test_index) in enumerate(skf.split(train_csr, train_y)):
    lgb_model.fit(train_csr[train_index], train_y[train_index],
                  eval_set=[(train_csr[train_index], train_y[train_index]),
                            (train_csr[test_index], train_y[test_index])],
                  early_stopping_rounds=200, verbose=10)
    best_score.append(lgb_model.best_score_['valid_1']['binary_logloss'])
    print(best_score)
    test_pred = lgb_model.predict_proba(predict_csr, num_iteration=lgb_model.best_iteration_)[:, 1]
    predict_result['predicted_score'] = predict_result['predicted_score'] + test_pred
predict_result['predicted_score'] = predict_result['predicted_score'] / 5
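# Each fold's model predicts the full test set; summing inside the loop and dividing
# by 5 (the fold count) here yields a simple average of the five fold models.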
mean = predict_result['predicted_score'].mean()
print('mean:', mean)
now = datetime.datetime.now()
now = now.strftime('%m-%d-%H-%M')
predict_result[['instance_id', 'predicted_score']].to_csv(path + "/submission/lgb_baseline_%s.csv" % now, index=False)