-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathES_Class.py
179 lines (163 loc) · 5.95 KB
/
ES_Class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch import helpers
import json
import time
class ElasticSearchClient(object):
# 实例和事务化单个node,若需要多个node,需要重构代码
def __init__(self, host="localhost", port=9200):
self.host = host
self.port = port
self.es_servers = [{
"host": self.host,
"port": self.port
}]
# http_auth是对设置了安全机制的es库需要写入 账号与密码,如果没有设置则不用写这个参数
try:
self.es_client = Elasticsearch(hosts=self.es_servers)
except Exception as e:
print(e)
print('连接es失败,请查看是否连接。')
# 进行创建一个数据库,即index
def create_index(self, index_name, settings=None):
"""
创建Index
:param index_name:
:param settings:
setting = {
"settings": {
"number_of_replicas": 1,
"number_of_shards": 1
},
"mappings": {
"properties": {
"docid": {
"type": "text"
},
"doc": {
"type": "text"
},
}
}
}
:return:
"""
if not self.es_client.indices.exists(index=index_name, ignore=[400, 404]):
if settings is not None:
self.es_client.indices.create(index=index_name, body=settings)
else:
self.es_client.indices.create(index=index_name)
# 进行删除一个数据库,即index
def delete_es_index(self, index_name):
if self.es_client.indices.exists(index=index_name, ignore=[400, 404]):
self.es_client.indices.delete(index=index_name)
def set_index_mapping(self, index, mappings):
# 设置mapping结构
"""
设置index的mapping,类似于表结构。
注意!!!!现在仅仅对mapping中的properties参数,其他的参数还很多
前提为:已有index,并且已自定义分词器,详情见https://blog.csdn.net/u013905744/article/details/80935846
输入参数举例说明:
mapping = {
'properties': {
'doc': {
'type': 'text',
'analyzer': 'whitespace',
'search_analyzer': 'whitespace',
},
'docid': {
'type': 'keyword',
}
}
}
"""
self.es_client.indices.put_mapping(index=index, body=mappings)
def add_date(self, index, data):
"""
单条插入ES
"""
self.es_client.index(index=index, body=data)
def add_date_bulk(self, index, data, batch_size=2000):
"""
批量插入ES,输入文本格式为单条插入的list格式
"""
actions = []
success_num = 0
fail_num = 0
start = time.time()
batch_step = 1
for idx, data_dict in enumerate(data):
action = {
"_index": index,
"_id": idx,
"_source": data_dict
}
actions.append(action)
# 批量处理
if len(actions) == batch_size or idx == len(data) - 1:
success, failed = bulk(self.es_client, actions, raise_on_error=True)
actions = []
success_num += success
fail_num += len(failed)
print(f'{batch_step}: 成功插入了{success}条数据, 失败{len(failed)}条数据')
batch_step += 1
end = time.time()
print(f'一共成功插入了{success_num}条数据, 失败{fail_num}条数据, 共花费{end - start}秒时间')
def update_by_id(self, index, idx, data):
"""
根据给定的_id,更新ES文档
:return:
"""
self.es_client.update(index=index, body={"doc": data}, id=idx)
def delete_by_id(self, index, idx):
"""
根据给定的id,删除文档
:return:
"""
self.es_client.delete(index=index, id=idx)
def search_by_query(self, index, query, return_list=False):
'''
根据查询的query语句,来搜索查询内容
'''
search_result = self.es_client.search(index=index, body=query)
if return_list:
search_result = self.get_result_list(search_result)
return search_result
# 将查询返回处理成list
def get_result_list(self, es_response):
final_result = []
result_items = es_response['hits']['hits']
for item in result_items:
final_result.append({
"score": item['_score'],
"result": item['_source']
}
)
return final_result
# scan查询
def search_by_scan(self, index, query, scroll='5m', timeout="1m", threshold=100):
es_result = helpers.scan(
client=self.es_client,
query=query,
scroll=scroll,
index=index,
timeout=timeout,
preserve_order=True
)
final_result = []
for item in es_result:
if item['_score'] < threshold:
break
final_result.append({
"score": item['_score'],
"result": item['_source']
}
)
return final_result
if __name__ == '__main__':
es = ElasticSearchClient(host="localhost", port=9200)
print(es)
# # 创建索引
# index = "test"
# es.delete_es_index(index_name=index)
# es.create_index(index_name=index)