-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatasetmgr.cpp
305 lines (283 loc) · 11.2 KB
/
datasetmgr.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
//
// Created by ngs on 27/04/2018.
//
#include "datasetmgr.h"
#include <cctype>
/**
 * Allocates the line buffer and every container owned by the manager.
 *
 * @param is_sentence_level stored in is_sentence_level_; OpenDataSet forwards it to
 *                          OpenTrainSet to choose how tokens are stored.
 */
DatasetMgr::DatasetMgr(bool is_sentence_level){
    // Configuration and counters.
    is_sentence_level_ = is_sentence_level;
    num_of_training_setence_ = 0;

    // Zero-initialised buffer that receives one raw line of the data set at a time.
    ptr_line_ = new char[LINE_MAX_SIZE]();

    // Observations (x) read from the data set.
    ptr_x_vector_ = new std::vector<std::string>();
    ptr_x_set_ = new std::set<std::string>();

    // Tags read from the data set.
    ptr_tag_vector_ = new std::vector<std::string>();
    ptr_tag_set_ = new std::set<std::string>();
    ptr_tag_count_map_ = new std::map<std::string, size_t >();

    // Tag -> tag transition statistics.
    ptr_pair_tag_vector_ = new std::vector<std::string>();
    ptr_pair_tag_count_map_ = new std::map<std::string, size_t >();
    ptr_state_to_state_prob_map_ = new std::map<std::string, double >();

    // Tag -> observation statistics.
    ptr_x_tag_vector_ = new std::vector<std::string>();
    ptr_x_tag_count_map_ = new std::map<std::string, size_t >();
    ptr_state_to_x_prob_map_ = new std::map<std::string, double >();

    // Containers filled by OpenTestSet.
    ptr_test_x_vector_ = new std::vector<std::string>();
    ptr_test_tag_vector_ = new std::vector<std::string>();
}
/**
 * Releases every heap object allocated by the constructor.
 * Pointers handed out by the getters become dangling once this has run.
 */
DatasetMgr::~DatasetMgr() {
    // Containers filled by OpenTestSet.
    delete ptr_test_tag_vector_;
    delete ptr_test_x_vector_;

    // Tag -> observation statistics.
    delete ptr_state_to_x_prob_map_;
    delete ptr_x_tag_count_map_;
    delete ptr_x_tag_vector_;

    // Tag -> tag transition statistics.
    delete ptr_state_to_state_prob_map_;
    delete ptr_pair_tag_count_map_;
    delete ptr_pair_tag_vector_;

    // Tags.
    delete ptr_tag_count_map_;
    delete ptr_tag_set_;
    delete ptr_tag_vector_;

    // Observations.
    delete ptr_x_set_;
    delete ptr_x_vector_;

    // The line buffer was created with new[], so it needs the array form of delete.
    delete[] ptr_line_;
}
/**
 * Reads a data set file line by line and stores its tokens through OpenTrainSet.
 *
 * Lines that are empty or start with a tab or a space are skipped. A line starting with '.'
 * is never tokenized; in sentence-level mode it additionally appends SPERATOR_FLAG to both
 * the observation vector and the tag vector and increments the sentence counter.
 *
 * @param training_file_name path that is read when is_training is true
 * @param test_file_name     path that is read when is_training is false
 * @param is_training        selects which file is read and is forwarded to Tokenized
 * @return false when the selected path is null or the file cannot be opened, true otherwise
 */
bool DatasetMgr:: OpenDataSet(const char *training_file_name, const char *test_file_name, bool is_training) {
    const char *file_name = is_training ? training_file_name : test_file_name;
    if (nullptr == file_name) {
        return false;
    }
    std::ifstream ifs(file_name);
    if (!ifs.is_open()) {
        // Report the failure instead of pretending an empty data set was loaded.
        return false;
    }
    std::vector<std::string> line_vector;
    while (ifs.getline(ptr_line_, LINE_MAX_SIZE)) {
        const char first_char = ptr_line_[0];
        if ('\0' == first_char || '\t' == first_char || ' ' == first_char) {
            continue;
        }
        if ('.' == first_char) {
            if (is_sentence_level_) {
                // A line starting with '.' closes the current sentence.
                ptr_x_vector_->push_back(SPERATOR_FLAG);
                ptr_tag_vector_->push_back(SPERATOR_FLAG);
                num_of_training_setence_++;
            }
            continue;
        }
        line_vector.clear();
        if (!Tokenized(ptr_line_, "\t ", &line_vector, TAG_MAX_SIZE, is_training)) {
            continue;
        }
        // OpenTrainSet reads the first, second-to-last and last token, so a line with
        // fewer than two tokens cannot be stored and is ignored.
        if (line_vector.size() < 2) {
            continue;
        }
        // Test files are stored through the same path as training files
        // (OpenTestSet is not used here).
        OpenTrainSet(&line_vector, is_sentence_level_);
    }
    return true;
}
/**
 * Echoes a feature map file to standard output, one line at a time.
 *
 * @param ptr_feture_map          accepted but not read or written by this implementation
 * @param ptr_reverse_feature_map accepted but not read or written by this implementation
 * @param file_name               path of the file to print
 */
void DatasetMgr::OpenFeatureMapFile(std::map<std::pair<int, int>, int> *ptr_feture_map,
                                    std::map<int, std::pair<int, int >> *ptr_reverse_feature_map,
                                    const char *file_name) {
    std::ifstream input(file_name);
    for (std::string text; std::getline(input, text);) {
        std::cout << text << std::endl;
    }
}
/**
 * Stores the tokens of one data set line in the training containers.
 *
 * The first token is used as the observation, the second-to-last token as the tag and the
 * last token as the parse label that is compared against TAGER_BIO_B / TAGER_BIO_O.
 *
 * @param ptr_vector        tokens of one line; ignored when null or shorter than two tokens
 * @param is_sentence_level true: only the observation and the tag are recorded;
 *                          false: the parse label additionally drives the OUT_FLAG /
 *                          SPERATOR_FLAG markers and the tag/observation pair is recorded
 */
void DatasetMgr::OpenTrainSet(std::vector<std::string> *ptr_vector, bool is_sentence_level) {
    // The second-to-last and last tokens are read below; with fewer than two tokens the
    // iterator arithmetic of the previous implementation was undefined behaviour.
    if (nullptr == ptr_vector || ptr_vector->size() < 2) {
        return;
    }
    const std::string &x = ptr_vector->front();
    const std::string &tag = (*ptr_vector)[ptr_vector->size() - 2];
    const std::string &parse = ptr_vector->back();

    if (is_sentence_level) {
        ptr_x_vector_->push_back(x);
        ptr_x_set_->insert(x);
        ptr_tag_vector_->push_back(tag);
        ptr_tag_set_->insert(tag);
        return;
    }

    ptr_tag_set_->insert(tag);
    // An "O" label is announced with OUT_FLAG in the tag vector so that
    // GenerateStateTransitionVector builds no transition for it.
    if (parse == TAGER_BIO_O) {
        ptr_tag_vector_->push_back(OUT_FLAG);
    }
    // A "B" or "O" label starts a new sequence: separate it from the previous one.
    if (parse == TAGER_BIO_B || parse == TAGER_BIO_O) {
        ptr_x_vector_->push_back(SPERATOR_FLAG);
        num_of_training_setence_++;
    }
    ptr_x_vector_->push_back(x);
    // Observations without duplicates.
    ptr_x_set_->insert(x);
    ptr_tag_vector_->push_back(tag);
    // Record "tag SPERATOR_FLAG observation"; Calc counts these pairs.
    std::string tag_and_x = tag;
    MergeTwoString(&tag_and_x, x, SPERATOR_FLAG);
    ptr_x_tag_vector_->push_back(tag_and_x);
}
/**
 * Stores the tokens of one test line in the test containers.
 *
 * The first token is used as the observation, the second token as the tag and the last
 * token as the parse label that is compared against TAGER_BIO_B / TAGER_BIO_O.
 *
 * @param ptr_vector tokens of one line; ignored when null or shorter than two tokens
 */
void DatasetMgr::OpenTestSet(std::vector<std::string> *ptr_vector) {
    // The second and the last token are read below; without this check a short line
    // would dereference an iterator outside the vector.
    if (nullptr == ptr_vector || ptr_vector->size() < 2) {
        return;
    }
    const std::string &x = (*ptr_vector)[0];
    const std::string &tag = (*ptr_vector)[1];
    const std::string &parse = ptr_vector->back();

    // To facilitate decoding, a SPERATOR_FLAG is inserted in front of each "B" or "O" word.
    if (parse == TAGER_BIO_B || parse == TAGER_BIO_O) {
        ptr_test_x_vector_->push_back(SPERATOR_FLAG);
        ptr_test_tag_vector_->push_back(SPERATOR_FLAG); // keeps both vectors aligned
    }
    ptr_test_x_vector_->push_back(x);
    ptr_test_tag_vector_->push_back(tag);
}
/**
 * Appends a separator followed by a second string to an existing string.
 *
 * @param ptr_str1  string that is extended in place
 * @param str2      text appended after the separator
 * @param separator text placed between the existing content and str2
 * @return false when ptr_str1 is null (nothing is modified), true otherwise
 */
bool DatasetMgr::MergeTwoString(std::string *ptr_str1, std::string str2, std::string separator) {
    if (nullptr == ptr_str1) {
        return false;
    }
    ptr_str1->append(separator).append(str2);
    return true;
}
/**
 * Splits one line into tokens.
 *
 * The line is modified in place: every delimiter that is found is overwritten with '\0'.
 * Empty tokens (consecutive delimiters) are dropped. A token whose first character is a
 * decimal digit is replaced by std::to_string(DIGITAL_FLAG), so numbers such as 120 or 34.5
 * are unified into one symbol.
 *
 * @param ptr_line        NUL-terminated line to split; modified in place
 * @param ptr_space       NUL-terminated set of delimiter characters
 * @param ptr_string_line receives the tokens, appended in order
 * @param tag_maxsize     maximum number of tokens to extract
 * @param istraining      not used: training and test lines are tokenized the same way
 * @return false when one of the pointer arguments is null, true otherwise
 */
bool DatasetMgr::Tokenized(char *ptr_line, const char *ptr_space, std::vector<std::string> *ptr_string_line, size_t tag_maxsize,bool istraining) {
    (void) istraining;
    if (nullptr == ptr_line || nullptr == ptr_space || nullptr == ptr_string_line) {
        return false;
    }
    char *endofline = ptr_line + std::strlen(ptr_line);
    const char *endofspace = ptr_space + std::strlen(ptr_space);
    size_t size = 0;
    while (size < tag_maxsize) {
        // Find the next delimiter (or the end of the line) and terminate the token there.
        char *space = std::find_first_of(ptr_line, endofline, ptr_space, endofspace);
        *space = '\0';
        if (*ptr_line != '\0') {
            // isdigit requires a value representable as unsigned char; a plain char may be
            // negative for non-ASCII bytes, which would be undefined behaviour.
            if (std::isdigit(static_cast<unsigned char>(*ptr_line))) {
                ptr_string_line->push_back(std::to_string(DIGITAL_FLAG));
            } else {
                ptr_string_line->push_back(ptr_line);
            }
            ++size;
        }
        if (space == endofline) {
            break;
        }
        ptr_line = space + 1;
    }
    return true;
}
/**
 * Counts how often each string occurs in a vector.
 *
 * Keys that are already present in the destination map keep their current value; only
 * missing keys are added.
 *
 * @param ptr_vector    strings to count
 * @param ptr_count_map receives one entry per distinct string with its number of occurrences
 * @param option        true: every string is counted; false: OUT_FLAG entries are left out
 */
void DatasetMgr::GenerateCountMap(std::vector<std::string> *ptr_vector, std::map<std::string, size_t> *ptr_count_map, bool option) {
    if (nullptr == ptr_vector || nullptr == ptr_count_map) {
        return;
    }
    // Count in a single pass instead of rescanning the whole vector for every element.
    std::map<std::string, size_t> occurrences;
    for (const std::string &item : *ptr_vector) {
        if (!option && item == OUT_FLAG) {
            continue;
        }
        ++occurrences[item];
    }
    // insert() does not overwrite, so entries already in the destination are preserved.
    ptr_count_map->insert(occurrences.begin(), occurrences.end());
}
/**
 * Builds the list of adjacent tag pairs from the training tag vector.
 *
 * Every tag is joined with its successor as "tag SPERATOR_FLAG next" and appended to the
 * pair vector, unless the successor is OUT_FLAG. An OUT_FLAG entry produces no pair and
 * the entry that follows it is skipped as a pair start as well.
 */
void DatasetMgr::GenerateStateTransitionVector() {
    const std::vector<std::string> &tags = *ptr_tag_vector_;
    size_t pos = 0;
    // The last element has no successor, so it never starts a pair.
    while (pos + 1 < tags.size()) {
        if (tags[pos] == OUT_FLAG) {
            // Step over the marker and over the tag it announces.
            pos += 2;
            continue;
        }
        const std::string &following = tags[pos + 1];
        if (following != OUT_FLAG) {
            std::string transition = tags[pos];
            MergeTwoString(&transition, following, SPERATOR_FLAG);
            ptr_pair_tag_vector_->push_back(transition);
        }
        ++pos;
    }
}
/**
 * Turns pair counts into conditional probabilities.
 *
 * Each key of the pair map has the form "first SPERATOR_FLAG rest". Its probability is the
 * pair count divided by the count of "first". Keys already present in the result map keep
 * their value.
 *
 * @param ptr_cout       count of every single state, looked up by the first part of a pair
 * @param ptr_trans_cout count of every pair
 * @param ptr_prob       receives one probability per pair, under the same key as the pair;
 *                       pairs whose first part has no (non-zero) count are left out
 */
void DatasetMgr::CalcProb(std::map<std::string, size_t> *ptr_cout, std::map<std::string, size_t> *ptr_trans_cout,
                          std::map<std::string, double> *ptr_prob) {
    if (nullptr == ptr_cout || nullptr == ptr_trans_cout || nullptr == ptr_prob) {
        return;
    }
    for (const auto &pair_entry : *ptr_trans_cout) {
        const std::string &pair_tag = pair_entry.first;
        const std::string first_state = pair_tag.substr(0, pair_tag.find(SPERATOR_FLAG, 0));
        const auto state_entry = ptr_cout->find(first_state);
        // Dereferencing end() is undefined behaviour and a zero count would divide by
        // zero, so such pairs get no probability.
        if (state_entry == ptr_cout->end() || 0 == state_entry->second) {
            continue;
        }
        const double trans_prob = static_cast<double>(pair_entry.second) / static_cast<double>(state_entry->second);
        ptr_prob->insert(std::make_pair(pair_tag, trans_prob));
    }
}
/**
 * Derives the transition and emission probability maps from the stored training data.
 */
void DatasetMgr::Calc() {
    // The tag pairs have to exist before they can be counted.
    GenerateStateTransitionVector();

    // Counts: single tags (OUT_FLAG left out), tag pairs, tag/observation pairs.
    GenerateCountMap(ptr_tag_vector_, ptr_tag_count_map_, false);
    GenerateCountMap(ptr_pair_tag_vector_, ptr_pair_tag_count_map_, true);
    GenerateCountMap(ptr_x_tag_vector_, ptr_x_tag_count_map_, true);

    // Probabilities: each pair count relative to the count of its leading tag.
    CalcProb(ptr_tag_count_map_, ptr_pair_tag_count_map_, ptr_state_to_state_prob_map_);
    CalcProb(ptr_tag_count_map_, ptr_x_tag_count_map_, ptr_state_to_x_prob_map_);
}
/**
 * @return the tag-to-tag probability map that Calc() fills; keys are two tags joined by
 *         SPERATOR_FLAG. The map is deleted by the destructor, so the caller must not
 *         delete it.
 */
std::map<std::string, double> *DatasetMgr::GetStateTransProbMap() const {
return ptr_state_to_state_prob_map_;
}
/**
 * @return the tag-to-observation probability map that Calc() fills; keys are a tag and an
 *         observation joined by SPERATOR_FLAG. The map is deleted by the destructor, so
 *         the caller must not delete it.
 */
std::map<std::string, double> *DatasetMgr::GetEmissionProbMap() const {
return ptr_state_to_x_prob_map_;
}
/**
 * @return the vector of test observations, which is filled by OpenTestSet. The vector is
 *         deleted by the destructor, so the caller must not delete it.
 */
std::vector<std::string> *DatasetMgr::GetTestFeatureVector() const {
return ptr_test_x_vector_;
}
/**
 * @return the vector of test tags, which is filled by OpenTestSet. The vector is deleted
 *         by the destructor, so the caller must not delete it.
 */
std::vector<std::string> *DatasetMgr::GetTestFlagVector() const {
return ptr_test_tag_vector_;
}
/**
 * @return the set of distinct tags collected by OpenTrainSet. The set is deleted by the
 *         destructor, so the caller must not delete it.
 */
std::set<std::string> *DatasetMgr::GetTagSet() const {
return ptr_tag_set_;
}
/**
 * @return the observation vector filled by OpenDataSet / OpenTrainSet, including the
 *         SPERATOR_FLAG entries they insert. The vector is deleted by the destructor, so
 *         the caller must not delete it.
 */
std::vector<std::string> *DatasetMgr::GetTrainingXVector() const {
return ptr_x_vector_;
}
/**
 * @return the set of distinct observations collected by OpenTrainSet. The set is deleted
 *         by the destructor, so the caller must not delete it.
 */
std::set<std::string> *DatasetMgr::GetTrainingXSet() const {
return ptr_x_set_;
}
/**
 * @return the sequence counter, which starts at 0 and is incremented each time
 *         OpenDataSet or OpenTrainSet inserts a SPERATOR_FLAG into the observation vector.
 */
size_t DatasetMgr::GetNumOfTrainingSeqs() const {
return num_of_training_setence_;
}
/**
 * @return the tag vector filled by OpenDataSet / OpenTrainSet, including the
 *         SPERATOR_FLAG and OUT_FLAG entries they insert. The vector is deleted by the
 *         destructor, so the caller must not delete it.
 */
std::vector<std::string>* DatasetMgr::GetTageVector() const {
return ptr_tag_vector_;
}