-
Notifications
You must be signed in to change notification settings - Fork 0
/
dat-get-unique-fields.js
163 lines (151 loc) · 5.36 KB
/
dat-get-unique-fields.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
/*
dat-get-unique-fields.js
Purpose: For a particular organization, develop a list pairing measurements with their fields.
Steps:
1. run through a list of .dat files and collect all the fieldnames in the list
2. If there a list that already exists, check to see which ones are already on this list
3. Remove redundant fields.
4. Save list of new unique fields
5. Run each field through the guessing algorithm and pair the field with a measurement
6. Insert the pair into "terms" of the org.unique_fields.json file.
-- if the measurement already exists, then append the field to the fields list
*/
fs = require("fs")
// Argument processing of paths
args = process.argv.slice(1)
org_slug = args[1]
datdir = args[2]
boo_write = true
new_fields = []
if(args.length != 3) {
console.log("usage: node dat-get-unique-fields <organization> <dat file directory>)")
process.exit()
}
console.log("dat-unique-fields: organization:",org_slug,", checking directory:",datdir)
// Load existing lists, create new if one doesn't exist
vocab_measurements = JSON.parse(fs.readFileSync('vocabulary/measurement_full.vocabulary.json'))
org_unique_fields_path = "fields/"+org_slug+".field_names_unique.json"
if(fs.existsSync(org_unique_fields_path)) {
unique_fields = JSON.parse(fs.readFileSync(org_unique_fields_path))
} else {
unique_fields = {
"org_slug": org_slug,
"terms": [
{
"fields": ["timestamp","record","day_of_year","_day_of_year","hourmin","sta_id","stationnum","cvmeta"],
"label": "IGNORE"
}
]
}
unique_field_string = JSON.stringify(unique_fields,null,2)
fs.writeFileSync("fields/"+org_slug+".field_names_unique.json",unique_field_string,'utf-8')
}
// Compare a header field to the organization's list of known fields.
// If it is not there, register it as new
function is_field_unique(header) {
is_unique = true
header = header.toLowerCase().replace('\n',"")
for (let term of unique_fields.terms) {
if(typeof(term.fields) === "undefined") {
console.log("ERROR. NO FIELDS IN TERM")
process.exit()
}
if(is_unique == false) { break }
for (let existing_field of term.fields) {
existing_field = existing_field.toLowerCase()
//console.log("\t",header,"=?",existing_field)
if(header == existing_field) {
is_unique = false
//console.log("\t!",header,"==",existing_field)
break
}
}
}
if(is_unique == true) {
//console.log("is_field_unique("+header+") NOT FOUND! Assuming unique.")
}
return is_unique
}
// Parses header fieldname, returns a Dendra-approved measurement, medium, variable or Unknown
function find_measurement_guess_field(header) {
header = header.toLowerCase().replace('\n',"").replace(" ","")
if(header.match("_flag")) { // This is specific to DRI files, which have an _flag condition column.
return "IGNORE"
}
for (let term of vocab_measurements.terms) {
if(typeof(term.abbreviations) === "undefined") { continue }
for (let abbrev of term.abbreviations) {
abbrev = abbrev.toLowerCase()
//console.log("\t",header,"=?",abbrev)
if(header.match(abbrev)) {
console.log("\t!",header,"==",abbrev,"-->",term.label)
return term.label
}
}
}
console.log("find_measurement_guess_field("+header+") NOT FOUND!")
return "UNKNOWN"
}
//-------------------------------------------------------
// Check each DAT file in directory for unique fields
fs.readdirSync(datdir).forEach(datfile => {
if(datfile.match(".dat")) {
console.log(datdir+datfile)
// Parse the header rows of the .dat file
datfile_content = fs.readFileSync(datdir+datfile).toString().split("\r\n")
headers = datfile_content[1].split(',')
console.log(datfile,headers.length)
//console.log("\t headers:")
for (var i=0;i<headers.length;i++) {
field = headers[i].replace(/"/g,'').replace(/ /g,'').replace('\n',"")
is_new_field = is_field_unique(field)
console.log("\t header:",field,"is_new:",is_new_field)
if(is_new_field == true) {
new_fields.push(field)
//console.log("\t header:",field)
}
}
}
})
// Remove duplicates from list
// new_fields will have duplicates of a new field
new_unique_fields = Array.from(new Set(new_fields))
//-------------------------------------------------------
// Now that we have a unique list of new fields
// Guess what measurement is correct for them
new_terms = {
"org_slug": org_slug,
"terms": [
{
"fields": [],
"label": "UNKNOWN"
}
]
}
for (let nufield of new_unique_fields) {
measurement = find_measurement_guess_field(nufield)
// Loop through new_terms and add the new field if it finds a measurement match
measurement_already_exists = false
for (let term of new_terms.terms) {
if(term.label == measurement) {
term.fields.push(nufield)
measurement_already_exists = true
break
}
}
// Add new term if it doesn't yet exist for this organization
if(measurement_already_exists == false) {
new_terms.terms.push( { "fields": [nufield], "label": measurement } )
}
}
// Write JSON file
if(boo_write == true) {
new_terms_string = JSON.stringify(new_terms,null,2)
fs.writeFileSync("fields/"+org_slug+".field_names_new.json",new_terms_string,'utf-8')
// temp remove
//new_fields_string = JSON.stringify(new_fields,null,2)
//fs.writeFileSync("fields/"+org_slug+".field_names_all.json",new_fields_string,'utf-8')
}
// List new headers found
console.log("New headers found:",new_fields.length,"Unique headers found:",new_unique_fields.length)
console.log("DONE!")