forked from smithoss/gonymizer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessors.go
316 lines (260 loc) · 11.5 KB
/
processors.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
package gonymizer
import (
"fmt"
"math/rand"
"strconv"
"strings"
"time"
"unicode/utf8"
"github.com/google/uuid"
"github.com/icrowley/fake"
)
// All processors are designed to work "unseeded".
// Make sure something seeds the RNG before you call the top-level process function.
// In order for the processor to "find" a function, the function has to
// 1. conform to ProcessorFunc
// 2. be in the processor map
// There are fancy ways for the reflection/runtime system to find functions
// that match certain text patterns, like how the system finds TestX(*testing.T) funcs,
// but we don't need that. Just put them in the map to make my life easy, please.
// The number of times to check the input string for similarity to the output string. We want to keep this at a distance
// of 0.4 or higher. Please see: https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
//const jaroWinklerAttempts = 1000
// Character sets used when scrambling alphanumeric strings, together with their lengths.
const (
	// lowercaseSet is the lookup string for random lowercase letters.
	lowercaseSet = "abcdefghijklmnopqrstuvwxyz"
	// uppercaseSet is the lookup string for random uppercase letters.
	uppercaseSet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	// numericSet is the lookup string for random digits.
	numericSet = "0123456789"

	// lowercaseSetLen is the number of characters in lowercaseSet.
	lowercaseSetLen = 26
	// uppercaseSetLen is the number of characters in uppercaseSet.
	uppercaseSetLen = 26
	// numericSetLen is the number of characters in numericSet.
	numericSetLen = 10
)
// Package-level lookup tables shared by the processors in this file.
var (
	// ProcessorCatalog maps each processor name to its entry function. Every processor must be listed in this map;
	// it is populated by init.
	ProcessorCatalog map[string]ProcessorFunc

	// AlphaNumericMap keeps scrambled alphanumeric strings consistent. The outer key is the parent column
	// ("schema.table.column"); the inner map goes from an original value to the scrambled value it was given.
	// For example, when scrambling Social Security Numbers, seeing the same SSN again yields the scramble that was
	// already produced for it.
	AlphaNumericMap = make(map[string]map[string]string)

	// UUIDMap is the global map from every original UUID that has been anonymized to its replacement. Some tables
	// use UUIDs as the primary key, and this keeps the data set consistent when anonymizing it.
	UUIDMap = make(map[uuid.UUID]uuid.UUID)
)
// init initializes the ProcessorCatalog map for all processors. A processor must be listed here to be accessible.
func init() {
ProcessorCatalog = map[string]ProcessorFunc{
"AlphaNumericScrambler": ProcessorAlphaNumericScrambler,
"FakeStreetAddress": ProcessorAddress,
"FakeCity": ProcessorCity,
"FakeEmailAddress": ProcessorEmailAddress,
"FakeFirstName": ProcessorFirstName,
"FakeFullName": ProcessorFullName,
"FakeLastName": ProcessorLastName,
"FakePhoneNumber": ProcessorPhoneNumber,
"FakeState": ProcessorState,
"FakeStateAbbrev": ProcessorStateAbbrev,
"FakeUsername": ProcessorUserName,
"FakeZip": ProcessorZip,
"Identity": ProcessorIdentity, // Default: Does not modify field
"RandomDate": ProcessorRandomDate,
"RandomUUID": ProcessorRandomUUID,
"ScrubString": ProcessorScrubString,
}
}
// ProcessorFunc is the signature every entry in ProcessorCatalog must have: it receives the column's mapping
// metadata and the column's current value, and returns the replacement value or an error.
type ProcessorFunc func(cmap *ColumnMapper, input string) (string, error)
// fakeFuncPtr is a simple function prototype for function pointers to the Fake package's fake functions.
//type fakeFuncPtr func() string
// ProcessorAlphaNumericScrambler scrambles every ASCII letter and digit of input (see scrambleString) and leaves all
// other characters untouched.
//
// When cmap names a parent column (ParentSchema, ParentTable and ParentColumn all non-empty), the result is
// remembered in AlphaNumericMap under "schema.table.column", so the same input seen again for that parent column
// maps to the same scrambled value. This is useful for PK/FK relationships. Without a parent column, every call
// scrambles independently. The returned error is always nil.
//
// Example:
// "PUI-7x9vY" = ProcessorAlphaNumericScrambler("ABC-1a2bC")
func ProcessorAlphaNumericScrambler(cmap *ColumnMapper, input string) (string, error) {
	// Unmapped column: nothing to remember, just scramble.
	if cmap.ParentSchema == "" || cmap.ParentTable == "" || cmap.ParentColumn == "" {
		return scrambleString(input), nil
	}

	parentKey := fmt.Sprintf("%s.%s.%s", cmap.ParentSchema, cmap.ParentTable, cmap.ParentColumn)

	// Reuse the value produced earlier for this input, if any (reading a missing inner map is safe).
	if known := AlphaNumericMap[parentKey][input]; len(known) > 0 {
		return known, nil
	}

	// First sighting: make sure the per-column map exists, then record the new scramble.
	if len(AlphaNumericMap[parentKey]) == 0 {
		AlphaNumericMap[parentKey] = map[string]string{}
	}
	fresh := scrambleString(input)
	AlphaNumericMap[parentKey][input] = fresh
	return fresh, nil
}
// ProcessorAddress discards the original value and returns the street address generated by fake.StreetAddress().
// Neither cmap nor input is consulted.
func ProcessorAddress(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.StreetAddress()
	return generated, nil
}
// ProcessorCity discards the original value and returns the city generated by fake.City(). Neither cmap nor input
// is consulted, and no similarity comparison against the input is performed.
func ProcessorCity(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.City()
	return generated, nil
}
// ProcessorEmailAddress discards the original value and returns the e-mail address generated by
// fake.EmailAddress(). Neither cmap nor input is consulted, and no similarity comparison against the input is
// performed.
func ProcessorEmailAddress(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.EmailAddress()
	return generated, nil
}
// ProcessorFirstName discards the original value and returns the first name generated by fake.FirstName().
// Neither cmap nor input is consulted, and no similarity comparison against the input is performed.
func ProcessorFirstName(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.FirstName()
	return generated, nil
}
// ProcessorFullName discards the original value and returns the full name generated by fake.FullName().
// Neither cmap nor input is consulted, and no similarity comparison against the input is performed.
func ProcessorFullName(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.FullName()
	return generated, nil
}
// ProcessorIdentity performs no anonymization: the value handed in is returned unchanged and the error is always
// nil. cmap is not consulted.
func ProcessorIdentity(cmap *ColumnMapper, input string) (string, error) {
	unchanged := input
	return unchanged, nil
}
// ProcessorLastName discards the original value and returns the last name generated by fake.LastName().
// Neither cmap nor input is consulted, and no similarity comparison against the input is performed.
func ProcessorLastName(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.LastName()
	return generated, nil
}
// ProcessorPhoneNumber discards the original value and returns the phone number generated by fake.Phone().
// Neither cmap nor input is consulted, and no similarity comparison against the input is performed.
func ProcessorPhoneNumber(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.Phone()
	return generated, nil
}
// ProcessorState discards the original value and returns the state generated by fake.State(). Neither cmap nor
// input is consulted, and no similarity comparison against the input is performed.
func ProcessorState(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.State()
	return generated, nil
}
// ProcessorStateAbbrev discards the original value and returns the state abbreviation generated by
// fake.StateAbbrev(). Neither cmap nor input is consulted.
func ProcessorStateAbbrev(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.StateAbbrev()
	return generated, nil
}
// ProcessorUserName discards the original value and returns the username generated by fake.UserName().
// Neither cmap nor input is consulted, and no similarity comparison against the input is performed.
func ProcessorUserName(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.UserName()
	return generated, nil
}
// ProcessorZip discards the original value and returns the zip code generated by fake.Zip(). Neither cmap nor
// input is consulted, and no similarity comparison against the input is performed.
func ProcessorZip(cmap *ColumnMapper, input string) (string, error) {
	generated := fake.Zip()
	return generated, nil
}
// ProcessorRandomDate replaces a date with one that has a random month and day but the same year
// (see: HIPAA rules, which only require the month and day to be scrambled).
//
// The input is expected in ISO 8601/SQL form, e.g. 2018-08-28: it must split into exactly three "-" separated
// parts and the first part must parse as an integer year; otherwise an error is returned. The month and day parts
// of the input are not inspected. cmap is not consulted.
func ProcessorRandomDate(cmap *ColumnMapper, input string) (string, error) {
	parts := strings.Split(input, "-")
	if len(parts) != 3 {
		return "", fmt.Errorf("Date format is not ISO-8601: %q", parts)
	}

	// Only the year survives anonymization, so it is the only component that needs parsing.
	year, convErr := strconv.Atoi(parts[0])
	if convErr != nil {
		return "", fmt.Errorf("Unable to parse year from date: %q", parts)
	}

	return randomizeDate(year), nil
}
// ProcessorRandomUUID replaces the input UUID with a randomly generated one. The input-to-output pairing is
// remembered (see randomizeUUID), so every later occurrence of the same input UUID is replaced with the UUID
// created for its first occurrence. If input does not parse as a UUID, an empty string and the parse error are
// returned. cmap is not consulted.
func ProcessorRandomUUID(cmap *ColumnMapper, input string) (string, error) {
	parsed, err := uuid.Parse(input)
	if err != nil {
		return "", err
	}
	return randomizeUUID(parsed)
}
// ProcessorScrubString blanks out the input by returning scrubString(input), i.e. a run of asterisks (*) in its
// place. Useful for blanking out password fields. cmap is not consulted and the error is always nil.
func ProcessorScrubString(cmap *ColumnMapper, input string) (string, error) {
	masked := scrubString(input)
	return masked, nil
}
/*
func jaroWinkler(input string, jwDistance float64, faker fakeFuncPtr) (output string, err error) {
for counter := 0; counter < jaroWinklerAttempts; counter++ {
output = faker()
if jw := matchr.JaroWinkler(input, output, true); jw > jwDistance {
return output, nil
}
}
return output, fmt.Errorf("Jaro-Winkler: distance < %e for %d attempts. Input: %s, Output: %s",
jwDistance, jaroWinklerAttempts, input, output)
}
*/
// randomizeUUID returns the replacement UUID for input as a string. If input has been seen before, the replacement
// recorded in UUIDMap is reused; otherwise a new random UUID is generated, recorded in UUIDMap and returned.
// An error is returned only when generating the random UUID fails.
func randomizeUUID(input uuid.UUID) (string, error) {
	if mapped, found := UUIDMap[input]; found {
		return mapped.String(), nil
	}

	fresh, err := uuid.NewRandom()
	if err != nil {
		return "", err
	}
	UUIDMap[input] = fresh
	return fresh.String(), nil
}
// randomizeDate picks a random month (1-12) and then a random day that is valid for that month in the given year,
// and returns the result formatted as YYYY-MM-DD. Leap years are honored: February 29th is only produced when the
// year has one.
func randomizeDate(year int) string {
	randMonth := rand.Intn(12) + 1

	// To find the length of the selected month, ask for day 0 of the FOLLOWING month: time.Date normalizes that to
	// the last day of randMonth. (Day 0 of randMonth itself would be the last day of the previous month, which
	// yields the wrong length.) Month 13 normalizes to January of the next year, so December works too.
	// See: https://yourbasic.org/golang/last-day-month-date/
	monthMaxDay := date(year, randMonth+1, 0).Day()

	randDay := rand.Intn(monthMaxDay) + 1
	return date(year, randMonth, randDay).Format("2006-01-02")
}

// date returns midnight UTC on the given year, month and day. Out-of-range month or day values are normalized by
// time.Date (e.g. day 0 becomes the last day of the previous month).
func date(year, month, day int) time.Time {
	return time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC)
}
// scrambleString returns a copy of input in which every ASCII lower-case letter is replaced with a random
// lower-case letter, every ASCII upper-case letter with a random upper-case letter, and every ASCII digit with a
// random digit. All other bytes are copied through unchanged, so the output has the same length as the input.
// The input is processed byte by byte; bytes outside the three ASCII ranges (including those that make up
// multi-byte UTF-8 characters) are never altered.
func scrambleString(input string) string {
	var out strings.Builder
	for _, ch := range []byte(input) {
		if 'a' <= ch && ch <= 'z' {
			out.WriteString(randomLowercase())
			continue
		}
		if 'A' <= ch && ch <= 'Z' {
			out.WriteString(randomUppercase())
			continue
		}
		if '0' <= ch && ch <= '9' {
			out.WriteString(randomNumeric())
			continue
		}
		// Not alphanumeric: keep as is.
		out.WriteByte(ch)
	}
	return out.String()
}
// scrubString returns a string of asterisks (*), one for each rune in input. An empty input yields an empty
// string.
func scrubString(input string) string {
	runeCount := utf8.RuneCountInString(input)

	var masked strings.Builder
	masked.Grow(runeCount)
	for i := 0; i < runeCount; i++ {
		masked.WriteByte('*')
	}
	return masked.String()
}
// randomLowercase will pick a random location in the lowercase constant string and return the letter at that position.
func randomLowercase() string {
return string(lowercaseSet[rand.Intn(lowercaseSetLen)])
}
// randomUppercase will pick a random location in the uppercase constant string and return the letter at that position.
func randomUppercase() string {
return string(uppercaseSet[rand.Intn(uppercaseSetLen)])
}
// randomNumeric will return a random location in the numeric constant string and return the number at that position.
func randomNumeric() string {
return string(numericSet[rand.Intn(numericSetLen)])
}