-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathfieldprofiler.awk
50 lines (32 loc) · 1.05 KB
/
fieldprofiler.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# FieldProfiler.awk
# This little script profiles the number of fields per record in each input file,
# helping you check the integrity of your data parser's output schema.
# USAGE
#
# Here is an example of how to double check your data has consistent fields per row:
#
# awk -F"\t" -f fieldprofiler.awk testdata/*.tab | sort -rn | column -t -s $'\t'
#
#
# Startup: use a tab as the output field separator for every print below.
BEGIN { OFS = "\t" }
# Per-record rule: count records per (input file, number of fields) pair.
{
    # uncomment to dump the raw (file, field count) pair for every record
    # print FILENAME, NF

    # same key as filerowcount[FILENAME, NF]: the parts are joined with SUBSEP
    filerowcount[FILENAME SUBSEP NF] += 1
}
# With the tally complete, emit a report: one header line, then one line per
# (file, field count) pair, ordered by ascending row count.
# Columns are joined with OFS: filename, RowCount, NumFieldsSeen.
END {
    # The header is written by awk itself, while the rows below are written by
    # a separate sort process sharing the same stdout. Flush now so the header
    # cannot end up after the sorted rows.
    print "filename", "RowCount", "NumFieldsSeen"
    fflush()

    # Sort numerically on column 2 (RowCount). A bare "sort -n" keys on the
    # start of the line, i.e. the filename, so it never ordered by row count.
    # The delimiter is taken from OFS so it always matches what print emits.
    sorter = "sort -t '" OFS "' -k2,2n"

    for (key in filerowcount) {
        # keys were stored as FILENAME SUBSEP NF; split them back apart
        split(key, vals, SUBSEP)
        print vals[1], filerowcount[key], vals[2] | sorter
    }

    # Close the pipe so sort sees end-of-input and finishes writing before awk
    # exits. Harmless if the loop never ran and the pipe was never opened.
    close(sorter)
} #endEnd