-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwat_array.hpp
351 lines (300 loc) · 12.6 KB
/
wat_array.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
/*
* Copyright (c) 2010 Daisuke Okanohara
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above Copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above Copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the name of the authors nor the names of its contributors
* may be used to endorse or promote products derived from this
* software without specific prior written permission.
*/
#ifndef WATARRAY_WATARRAY_HPP_
#define WATARRAY_WATARRAY_HPP_
#include <vector>
#include <queue>
#include <tuple>
#include <stdint.h>
#include <iostream>
#include <cassert>
#include <unordered_map>
#include "bit_array.hpp"
namespace wat_array {
/**
Wavelet Tree Array Library for array processing.
Input: A[0...n], 0 <= A[i] < k,
Space: n log_2 k bits
Support many queries in O(log k) time and constant for n.
*/
struct ListResult{
ListResult(uint64_t c, uint64_t freq) : c(c), freq(freq){}
uint64_t c;
uint64_t freq;
bool operator < (const ListResult& lr) const {
if (c != lr.c) return c < lr.c;
return freq < lr.freq;
}
};
class WatArray{
public:
/**
* Constructor
*/
WatArray();
/**
* Destructor
*/
~WatArray();
/**
* Initialize an index from an array
* @param An array to be initialized
*/
void Init(const std::vector<uint64_t>& array);
void Init(const BitArray& ba, uint64_t width, uint64_t length);
/**
* Clear and release the resouces
*/
void Clear();
/**
* Lookup A[pos]
* @param pos the position
* @return return A[pos] if found, or return NOT_FOUND if pos >= length
*/
uint64_t Lookup(uint64_t pos) const;
/**
* Compute the rank = the frequency of a character 'c' in the prefix of the array A[0...pos)
* @param c Character to be examined
* @param pos The position of the prefix (not inclusive)
* @return The frequency of a character 'c' in the prefix of the array A[0...pos)
* or NOT_FOUND if c >= alphabet_num or pos > length
*/
uint64_t Rank(uint64_t c, uint64_t pos) const;
/**
* Compute the select = the position of the (rank+1)-th occurence of 'c' in the array.
* @param c Character to be examined
* @param rank The rank of the character
* @return The position of the (rank+1)-th occurence of 'c' in the array.
* or NOT_FOUND if c >= alphabet_num or rank > Freq(c)
*/
uint64_t Select(uint64_t c, uint64_t rank) const;
/**
* Compute the frequency of characters c' < c in the subarray A[0...pos)
* @param c The upper bound of the characters
* @param pos The position of the end of the prefix (not inclusive)
* @return The frequency of characters c' < c in the prefix of the array A[0...pos)
or NOTFOUND if c > alphabet_num or pos > length
*/
uint64_t RankLessThan(uint64_t c, uint64_t pos) const;
/**
* Compute the frequency of characters c' > c in the subarray A[0...pos)
* @param c The lower bound of the characters
* @param pos The position of the end of the prefix (not inclusive)
* @return The frequency of characters c' < c in the prefix of the array A[0...pos)
or NOTFOUND if c > alphabet_num or pos > length
*/
uint64_t RankMoreThan(uint64_t c, uint64_t pos) const;
/*
* Count distinct characters
* Added by Masumi Shirakawa, Aug. 18, 2014.
*/
int Count(uint64_t beg_pos, uint64_t end_pos,
uint64_t beg_node, uint64_t end_node, size_t h) const;
/*
* Count distinct characters with number of times (to cope with phrases like "Johnson & Johnson")
* Added by Masumi Shirakawa, Apr. 10, 2015.
*/
int Count(std::vector<uint64_t> beg_pos, std::vector<uint64_t> end_pos, std::vector<size_t> nums,
uint64_t beg_node, uint64_t end_node, size_t h) const;
/*
* Approximately Count distinct characters with number of times
* Added by Masumi Shirakawa, Aug. 18, 2015.
*/
int ApproxCount(uint64_t beg_pos, uint64_t end_pos,
uint64_t beg_node, uint64_t end_node, size_t h, int th) const;
int ApproxCount(std::vector<uint64_t> beg_pos, std::vector<uint64_t> end_pos,
std::vector<size_t> nums, uint64_t beg_node, uint64_t end_node,
size_t h, int th) const;
std::tuple<double, int> ApproxCount(std::vector<uint64_t> beg_pos, std::vector<uint64_t> end_pos,
std::vector<size_t> nums, uint64_t beg_node, uint64_t end_node,
size_t h, int th, int tmpcount) const;
/**
* Compute the frequency of characters c' < c, c'=c, and c' > c, in the subarray A[0...pos)
* @param c The character
* @param pos The position of the end of the prefix (not inclusive)
* @param rank The frefquency of c in A[0...pos)
* @param rank_less_than The frequency of c' < c in A[0...pos)
* @param rank_more_than The frequency of c' > c in A[0...pos)
*/
void RankAll(uint64_t c, uint64_t pos, uint64_t& rank,
uint64_t& rank_less_than, uint64_t& rank_more_than) const;
/**
* Compute the frequency of characters min_c <= c' < max_c in the subarray A[beg_pos ... end_pos)
* @param min_c The smallerest character to be examined
* @param max_c The uppker bound of the character to be examined
* @param beg_pos The beginning position of the array (inclusive)
* @param end_pos The ending position of the array (not inclusive)
* @return The frequency of characters min_c <= c < max_c in the subarray A[beg_pos .. end_pos)
or NOTFOUND if max_c > alphabet_num or end_pos > length
*/
uint64_t FreqRange(uint64_t min_c, uint64_t max_c, uint64_t beg_pos, uint64_t end_pos) const;
/**
* Range Max Query
* @param beg_pos The beginning position
* @param end_pos The ending position
* @param pos The position where the largest value appeared in the subarray A[beg_pos .. end_pos)
If there are many items having the largest values, the smallest pos will be reporeted
* @param val The largest value appeared in the subarray A[beg_pos ... end_pos)
*/
void MaxRange(uint64_t beg_pos, uint64_t end_pos, uint64_t& pos, uint64_t& val) const;
/**
* Range Min Query
* @param beg_pos The beginning position
* @param end_pos The ending position
* @param pos The position where the smallest value appeared in the subarray A[beg_pos .. end_pos)
If there are many items having the smalles values, the smallest pos will be reporeted
* @param val The smallest value appeared in the subarray A[beg_pos ... end_pos)
*/
void MinRange(uint64_t beg_pos, uint64_t end_pos, uint64_t& pos, uint64_t& val) const;
/**
* Range Quantile Query, Return the K-th smallest value in the subarray
* @param beg_pos The beginning position
* @param end_pos The ending position
* @param k The order (should be smaller than end_pos - beg_pos).
* @param pos The position where the k-th largest value appeared in the subarray A[beg_pos .. end_pos)
If there are many items having the k-th largest values, the smallest pos will be reporeted
* @param val The k-th largest value appeared in the subarray A[beg_pos ... end_pos)
*/
void QuantileRange(uint64_t beg_pos, uint64_t end_pos, uint64_t k, uint64_t& pos, uint64_t& val) const;
/**
* List the distinct characters appeared in A[beg_pos ... end_pos) from most frequent ones
*/
void ListModeRange(uint64_t min_c, uint64_t max_c, uint64_t beg_pos, uint64_t end_pos, uint64_t num, std::vector<ListResult>& res) const;
/**
* List the distinct characters in A[beg_pos ... end_pos) min_c <= c < max_c from smallest ones
* @param min_c The smallerest character to be examined
* @param max_c The uppker bound of the character to be examined
* @param beg_pos The beginning position of the array (inclusive)
* @param end_pos The ending positin of the array (not inclusive)
* @param num The maximum number of reporting results.
* @param res The distinct chracters in the A[beg_pos ... end_pos) from smallest ones.
* Each item consists of c:character and freq: frequency of c.
*/
void ListMinRange(uint64_t min_c, uint64_t max_c, uint64_t beg_pos, uint64_t end_pos, uint64_t num, std::vector<ListResult>& res) const;
/**
* List the distinct characters appeared in A[beg_pos ... end_pos) from largest ones
* @param min_c The smallerest character to be examined
* @param max_c The uppker bound of the character to be examined
* @param beg_pos The beginning position of the array (inclusive)
* @param end_pos The ending positin of the array (not inclusive)
* @param num The maximum number of reporting results.
* @param res The distinct chracters in the A[beg_pos ... end_pos) from largestx ones.
* Each item consists of c:character and freq: frequency of c.
*/
void ListMaxRange(uint64_t min_c, uint64_t max_c, uint64_t beg_pos, uint64_t end_pos, uint64_t num, std::vector<ListResult>& res) const;
/**
* Compute the frequency of the character c
* @param c The character to be examined
* param Return the frequency of c in the array.
*/
uint64_t Freq(uint64_t c) const;
/**
* Compute the frequency of the characters
* @param min_c The minimum character
* @param max_c The maximum character
* param Return the frequency of min_c <= c < max_c in the array.
*/
uint64_t FreqSum(uint64_t min_c, uint64_t max_c) const;
/**
* Return the number of alphabets in the array
* @return The number of alphabet in the array
*/
uint64_t alphabet_num() const;
/**
* Return the length of the array
* @return The length of the array
*/
uint64_t length() const;
/**
* Save the current status to a stream
* @param os The output stream where the data is saved
*/
void Save(std::ostream& os) const;
/**
* Load the current status from a stream
* @param is The input stream where the status is saved
*/
void Load(std::istream& is);
private:
uint64_t GetAlphabetNum(const std::vector<uint64_t>& array) const;
uint64_t Log2(uint64_t x) const;
uint64_t PrefixCode(uint64_t x, uint64_t len, uint64_t total_len) const;
static uint64_t GetMSB(uint64_t x, uint64_t pos, uint64_t len);
static uint64_t GetLSB(uint64_t x, uint64_t pos);
void SetArray(const std::vector<uint64_t>& array);
void SetOccs(const std::vector<uint64_t>& array);
void GetBegPoses(const std::vector<uint64_t>& array, uint64_t alpha_bit_num,
std::vector<std::vector<uint64_t> >& beg_poses) const;
struct QueryOnNode{
QueryOnNode(uint64_t beg_node, uint64_t end_node, uint64_t beg_pos, uint64_t end_pos,
uint64_t depth, uint64_t prefix_char) :
beg_node(beg_node), end_node(end_node), beg_pos(beg_pos), end_pos(end_pos),
depth(depth), prefix_char(prefix_char) {}
uint64_t beg_node;
uint64_t end_node;
uint64_t beg_pos;
uint64_t end_pos;
uint64_t depth;
uint64_t prefix_char;
void print() {
std::cout << beg_node << " " << end_node << " "
<< beg_pos << " " << end_pos << " "
<< depth << " " ;
for (uint64_t i = 0; i < depth; ++i){
std::cout << GetMSB(prefix_char, i, depth);
}
std::cout << std::endl;
}
};
class ListModeComparator;
class ListMinComparator;
class ListMaxComparator;
template <class Comparator>
void ListRange(uint64_t min_c, uint64_t max_c,
uint64_t beg_pos, uint64_t end_pos,
uint64_t num, std::vector<ListResult>& res) const {
res.clear();
if (end_pos > length_ || beg_pos >= end_pos) return;
std::priority_queue<QueryOnNode, std::vector<QueryOnNode>, Comparator> qons;
qons.push(QueryOnNode(0, length_, beg_pos, end_pos, 0, 0));
while (res.size() < num && !qons.empty()){
QueryOnNode qon = qons.top();
qons.pop();
if (qon.depth >= alphabet_bit_num_){
res.push_back(ListResult(qon.prefix_char, qon.end_pos - qon.beg_pos));
} else {
std::vector<QueryOnNode> next;
ExpandNode(min_c, max_c, qon, next);
for (size_t i = 0; i < next.size(); ++i){
qons.push(next[i]);
}
}
}
}
bool CheckPrefix(uint64_t prefix, uint64_t depth, uint64_t min_c, uint64_t max_c) const;
void ExpandNode(uint64_t min_c, uint64_t max_c,
const QueryOnNode& qon, std::vector<QueryOnNode>& next) const;
std::vector<BitArray> bit_arrays_;
BitArray occs_;
uint64_t alphabet_num_;
uint64_t alphabet_bit_num_;
uint64_t length_;
};
}
#endif // WASEQ_WASEQ_HPP_