This repository has been archived by the owner on Sep 26, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathmain.go
114 lines (104 loc) · 2.84 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package main
import (
"encoding/csv"
"flag"
"fmt"
"hash/fnv"
"io"
"log"
"math/rand"
"os"
"time"
)
// main is the command-line entry point. It loads the configuration named by
// the -config flag, opens the input (first positional argument) and the
// output (-output flag), builds the anonymisations from the configured
// actions and hands everything to process. Any error reported by those steps
// terminates the program through log.Fatal.
func main() {
	// Seed the global math/rand source with the current time.
	rand.Seed(time.Now().UTC().UnixNano())

	//TODO move args parsing to a function
	var configPath, outputPath string
	flag.StringVar(&configPath, "config", "config.json", "Configuration of the data to be anonymised. Default is 'config.json'")
	flag.StringVar(&outputPath, "output", "", "Output file. Default is stdout.")
	flag.Parse()

	log.Printf("Using configuration in file %s\n", configPath)
	cfg, err := loadConfig(configPath)
	if err != nil {
		log.Fatal(err)
	}

	// The reader and writer are opened before the anonymisations are built,
	// so the output file exists even if that later step fails.
	reader := initReader(flag.Arg(0), cfg.Csv)
	writer := initWriter(outputPath, cfg.Csv)

	anons, err := anonymisations(&cfg.Actions)
	if err != nil {
		log.Fatal(err)
	}

	if err := process(reader, writer, cfg, &anons); err != nil {
		log.Fatal(err)
	}
}
// process streams CSV records from r to w. For every record it hashes the
// column conf.Sampling.IDColumn with sample; records that are selected are
// passed through anonymise with *anons and written to w.
//
// Records with a wrong number of fields, and records whose anonymisation
// fails, are logged and skipped. Processing stops and an error is returned
// when reading fails for any other reason, when the ID column does not exist
// in a record, or when writing to w fails.
func process(r *csv.Reader, w *csv.Writer, conf *Config, anons *[]Anonymisation) error {
	// i counts every record read, including skipped ones.
	for i := 0; ; i++ {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if pe, ok := err.(*csv.ParseError); ok && pe.Err == csv.ErrFieldCount {
			// we just print the error and skip the record
			log.Print(err)
			continue
		}
		if err != nil {
			return err
		}

		// Compare in int64 so that a negative (or wrapped) column index is
		// reported as an error instead of panicking on the slice access.
		idCol := int64(conf.Sampling.IDColumn)
		if idCol < 0 || idCol >= int64(len(record)) {
			return fmt.Errorf("id column (%d) out of range, record has %d columns", conf.Sampling.IDColumn, len(record))
		}
		if !sample(record[conf.Sampling.IDColumn], conf.Sampling) {
			continue
		}

		anonymised, err := anonymise(record, *anons)
		if err != nil {
			// we just print the error and skip the record
			log.Print(err)
		} else if err := w.Write(anonymised); err != nil {
			// A failed write means the output is broken; do not keep going.
			return err
		}

		//TODO decide how often do we want to flush
		if i%100 == 0 {
			w.Flush()
			// Flush does not return an error; it has to be fetched explicitly.
			if err := w.Error(); err != nil {
				return err
			}
		}
	}

	w.Flush()
	return w.Error()
}
// sample reports whether the value s falls into the sampled subset: it
// computes the 32-bit FNV-1a hash of s and returns true when that hash is a
// multiple of conf.Mod. The decision is deterministic for a given s.
//
// conf.Mod must be non-zero; a zero modulus makes the remainder operation
// panic.
func sample(s string, conf SamplingConfig) bool {
	hasher := fnv.New32a()
	hasher.Write([]byte(s))
	remainder := hasher.Sum32() % conf.Mod
	return remainder == 0
}
// initReader returns a CSV reader over the file named filename, or over
// standard input when filename is empty (opening is delegated to fileOr,
// which terminates the program if the file cannot be opened).
//
// The field separator is the first rune of conf.Delimiter. When the delimiter
// is empty the encoding/csv default separator is kept, rather than panicking
// on an out-of-range index.
func initReader(filename string, conf CsvConfig) *csv.Reader {
	reader := csv.NewReader(fileOr(filename, os.Stdin, os.Open))
	if delim := []rune(conf.Delimiter); len(delim) > 0 {
		reader.Comma = delim[0]
	}
	return reader
}
// initWriter returns a CSV writer over the file named filename (created or
// truncated with os.Create), or over standard output when filename is empty.
// Opening is delegated to fileOr, which terminates the program if the file
// cannot be created.
//
// The field separator is the first rune of conf.Delimiter. When the delimiter
// is empty the encoding/csv default separator is kept, rather than panicking
// on an out-of-range index.
func initWriter(filename string, conf CsvConfig) *csv.Writer {
	writer := csv.NewWriter(fileOr(filename, os.Stdout, os.Create))
	if delim := []rune(conf.Delimiter); len(delim) > 0 {
		writer.Comma = delim[0]
	}
	return writer
}
// fileOr picks the file to work with. An empty filename selects def;
// otherwise filename is passed to action and the file it returns is used.
// If action reports an error, the error is logged and the program exits
// through log.Fatal.
func fileOr(filename string, def *os.File, action func(string) (*os.File, error)) *os.File {
	if filename != "" {
		opened, err := action(filename)
		if err != nil {
			log.Fatal(err)
		}
		return opened
	}
	return def
}
// anonymise applies anons[i] to record[i] for every column that has a
// matching anonymisation, replacing the values in place, and returns the
// same record slice. Columns beyond len(anons) are left untouched.
//
// If an anonymisation fails, its error is returned together with a nil
// record; columns processed up to and including the failing one have already
// been overwritten in the caller's slice.
func anonymise(record []string, anons []Anonymisation) ([]string, error) {
	// TODO decide if we fail if not enough anonmisations are defined
	// or we just skip the column (i.e. we apply identity)
	limit := len(record)
	if len(anons) < limit {
		limit = len(anons)
	}

	for col := 0; col < limit; col++ {
		value, err := anons[col](record[col])
		// Store the result before checking the error, as a combined
		// assignment would.
		record[col] = value
		if err != nil {
			return nil, err
		}
	}
	return record, nil
}