-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathpopcount.h
187 lines (161 loc) · 6.95 KB
/
popcount.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ifndef _FASTRANK_POPCOUNT_H_
#define _FASTRANK_POPCOUNT_H_
#include <sys/types.h>
#include <stdio.h>
#define L8 0x0101010101010101ULL // Every lowest 8th bit set: 00000001...
#define G2 0xAAAAAAAAAAAAAAAAULL // Every highest 2nd bit: 101010...
#define G4 0x3333333333333333ULL // 00110011 ... used to group the sum of 4 bits.
#define G8 0x0F0F0F0F0F0F0F0FULL
#define H8 0x8080808080808080ULL
#define L9 0x0040201008040201ULL
#define H9 (L9 << 8)
#define L16 0x0001000100010001ULL
#define H16 0x8000800080008000ULL
#define ONES_STEP_4 ( 0x1111111111111111ULL )
#define ONES_STEP_8 ( 0x0101010101010101ULL )
#define ONES_STEP_9 ( 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 | 1ULL << 36 | 1ULL << 45 | 1ULL << 54 )
#define ONES_STEP_16 ( 1ULL << 0 | 1ULL << 16 | 1ULL << 32 | 1ULL << 48 )
#define MSBS_STEP_4 ( 0x8ULL * ONES_STEP_4 )
#define MSBS_STEP_8 ( 0x80ULL * ONES_STEP_8 )
#define MSBS_STEP_9 ( 0x100ULL * ONES_STEP_9 )
#define MSBS_STEP_16 ( 0x8000ULL * ONES_STEP_16 )
#define INCR_STEP_8 ( 0x80ULL << 56 | 0x40ULL << 48 | 0x20ULL << 40 | 0x10ULL << 32 | 0x8ULL << 24 | 0x4ULL << 16 | 0x2ULL << 8 | 0x1 )
#define ONES_STEP_32 ( 0x0000000100000001ULL )
#define MSBS_STEP_32 ( 0x8000000080000000ULL )
#define COMPARE_STEP_8(x,y) ( ( ( ( ( (x) | MSBS_STEP_8 ) - ( (y) & ~MSBS_STEP_8 ) ) ^ (x) ^ ~(y) ) & MSBS_STEP_8 ) >> 7 )
#define LEQ_STEP_8(x,y) ( ( ( ( ( (y) | MSBS_STEP_8 ) - ( (x) & ~MSBS_STEP_8 ) ) ^ (x) ^ (y) ) & MSBS_STEP_8 ) >> 7 )
#define UCOMPARE_STEP_9(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_9 ) - ( (y) & ~MSBS_STEP_9 ) ) | ( x ^ y ) ) ^ ( x | ~y ) ) & MSBS_STEP_9 ) >> 8 )
#define UCOMPARE_STEP_16(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_16 ) - ( (y) & ~MSBS_STEP_16 ) ) | ( x ^ y ) ) ^ ( x | ~y ) ) & MSBS_STEP_16 ) >> 15 )
#define ULEQ_STEP_9(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_9 ) - ( (x) & ~MSBS_STEP_9 ) ) | ( x ^ y ) ) ^ ( x & ~y ) ) & MSBS_STEP_9 ) >> 8 )
#define ULEQ_STEP_16(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_16 ) - ( (x) & ~MSBS_STEP_16 ) ) | ( x ^ y ) ) ^ ( x & ~y ) ) & MSBS_STEP_16 ) >> 15 )
#define ZCOMPARE_STEP_8(x) ( ( ( x | ( ( x | MSBS_STEP_8 ) - ONES_STEP_8 ) ) & MSBS_STEP_8 ) >> 7 )
// Population count of a 64 bit integer in SWAR (SIMD within a register) style
// From Sebastiano Vigna, "Broadword Implementation of Rank/Select Queries"
// http://sux.dsi.unimi.it/paper.pdf p4
// This variant uses multiplication for the last summation instead of
// continuing the shift/mask/addition chain.
inline int suxpopcount(uint64 x) {
// Step 1: 00 - 00 = 0; 01 - 00 = 01; 10 - 01 = 01; 11 - 01 = 10;
x = x - ((x & G2) >> 1);
// step 2: add 2 groups of 2.
x = (x & G4) + ((x >> 2) & G4);
// 2 groups of 4.
x = (x + (x >> 4)) & G8;
// Using a multiply to collect the 8 groups of 8 together.
x = x * L8 >> 56;
return x;
}
// Default to using the GCC builtin popcount. On architectures
// with -march popcnt, this compiles to a single popcnt instruction.
#ifndef popcount
#define popcount __builtin_popcountll
//#define popcount suxpopcount
#endif
#define popcountsize 64ULL
#define popcountmask (popcountsize - 1)
inline uint64 popcountLinear(uint64 *bits, uint64 x, uint64 nbits) {
if (nbits == 0) { return 0; }
uint64 lastword = (nbits - 1) / popcountsize;
uint64 p = 0;
for (int i = 0; i < lastword; i++) { /* tested; manually unrolling doesn't help, at least in C */
p += popcount(bits[x+i]); // note that use binds us to 64 bit popcount impls
}
// 'nbits' may or may not fall on a multiple of 64 boundary,
// so we may need to zero out the right side of the last word
// (accomplished by shifting it right, since we're just popcounting)
uint64 lastshifted = bits[x+lastword] >> (63 - ((nbits - 1) & popcountmask));
p += popcount(lastshifted);
return p;
}
// Return the index of the kth bit set in x
inline int select64_naive(uint64 x, int k) {
int count = -1;
for (int i = 63; i >= 0; i--) {
count++;
if (x & (1ULL << i)) {
k--;
if (k == 0) {
return count;
}
}
}
return -1;
}
inline int select64_popcount_search(uint64 x, int k) {
int loc = -1;
// if (k > popcount(x)) { return -1; }
for (int testbits = 32; testbits > 0; testbits >>= 1) {
int lcount = popcount(x >> testbits);
if (k > lcount) {
x &= ((1ULL << testbits)-1);
loc += testbits;
k -= lcount;
} else {
x >>= testbits;
}
}
return loc+k;
}
inline int select64_broadword(uint64 x, int k) {
uint64 word = x;
int residual = k;
register uint64 byte_sums;
byte_sums = word - ( ( word & 0xa * ONES_STEP_4 ) >> 1 );
byte_sums = ( byte_sums & 3 * ONES_STEP_4 ) + ( ( byte_sums >> 2 ) & 3 * ONES_STEP_4 );
byte_sums = ( byte_sums + ( byte_sums >> 4 ) ) & 0x0f * ONES_STEP_8;
byte_sums *= ONES_STEP_8;
// Phase 2: compare each byte sum with the residual
const uint64 residual_step_8 = residual * ONES_STEP_8;
const int place = ( LEQ_STEP_8( byte_sums, residual_step_8 ) * ONES_STEP_8 >> 53 ) & ~0x7;
// Phase 3: Locate the relevant byte and make 8 copies with incremental masks
const int byte_rank = residual - ( ( ( byte_sums << 8 ) >> place ) & 0xFF );
const uint64 spread_bits = ( word >> place & 0xFF ) * ONES_STEP_8 & INCR_STEP_8;
const uint64 bit_sums = ZCOMPARE_STEP_8( spread_bits ) * ONES_STEP_8;
// Compute the inside-byte location and return the sum
const uint64 byte_rank_step_8 = byte_rank * ONES_STEP_8;
return place + ( LEQ_STEP_8( bit_sums, byte_rank_step_8 ) * ONES_STEP_8 >> 56 );
}
inline int select64(uint64 x, int k) {
return select64_popcount_search(x, k);
}
// x is the starting offset of the 512 bits;
// k is the thing we're selecting for.
inline int select512(uint64 *bits, int x, int k) {
__asm__ __volatile__ (
"prefetchnta (%0)\n"
: : "r" (&bits[x]) );
int i = 0;
int pop = popcount(bits[x+i]);
while (k > pop && i < 7) {
k -= pop;
i++;
pop = popcount(bits[x+i]);
}
if (i == 7 && popcount(bits[x+i]) < k) {
return -1;
}
// We're now certain that the bit we want is stored in bv[x+i]
return i*64 + select64(bits[x+i], k);
}
// brute-force linear select
// x is the starting offset of the bits in bv;
// k is the thing we're selecting for (starting from bv[x]).
// bvlen is the total length of bv
inline uint64 selectLinear(uint64* bits, uint64 length, uint64 x, uint64 k) {
if (k > (length - x) * 64)
return -1;
uint64 i = 0;
uint64 pop = popcount(bits[x+i]);
while (k > pop && i < (length - 1)) {
k -= pop;
i++;
pop = popcount(bits[x+i]);
}
if ((i == length - 1) && (pop < k)) {
return -1;
}
// We're now certain that the bit we want is stored in bits[x+i]
return i*64 + select64(bits[x+i], k);
}
#endif /* _FASTRANK_POPCOUNT_H_ */