Skip to content

Commit

Permalink
add the group count and two-pass function
Browse files Browse the repository at this point in the history
  • Loading branch information
danielecook committed Jul 14, 2020
1 parent 9bdf789 commit 3b15462
Show file tree
Hide file tree
Showing 7 changed files with 194 additions and 7 deletions.
1 change: 1 addition & 0 deletions data/data-01.still
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
currency: any(great)
letters: is_subset_list("A,B,C", ",") && !is_missing()
identical_col: identical()
extra: any("A", "B")
---
great_values:
- SZL
Expand Down
27 changes: 27 additions & 0 deletions docs/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,30 @@ configuration: count(color, size) <= 10 # Fails if the combination of values is
(colname ?? 1) == 1 # returns TRUE if colname==NA/nil
```
## Two-Pass Functions
Two-pass functions allow for more advanced expressions to be evaluated, but they require a first-pass through the file to collect information.
##### `group_count`
```
group_count(group_column, count_column, eq_value)
```
`group_count` will group data by the `group_column`, and count the number of occurrences of `eq_value` in the `count_column`.
__example__
```
family_id: is_int()
person: group_count(person, is_head_of_household, true) == 1
is_head_of_household: is_bool()
```
In the example above, we are checking to see that only one person in a family is set to `true` for the column `is_head_of_household`. If you have missing data in your grouping column, you may need to use an `if_else` statement to conditionally validate a row as true:
```
family_id: is_int()
person: if_else(is_missing(), true, group_count(person, is_head_of_household, true) == 1)
is_head_of_household: is_bool()
```
4 changes: 4 additions & 0 deletions src/reader/excel/excel.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,7 @@ func (r *excelReader) Read() (row []string, err error) {
// Row returns the reader's current row counter (r.currentRow).
func (r *excelReader) Row() int {
return r.currentRow
}

// Reset sets the reader's current row counter back to 0.
// NOTE(review): only the counter is changed here; whether the underlying
// sheet iterator also needs rewinding is not visible in this hunk — confirm.
func (r *excelReader) Reset() {
r.currentRow = 0
}
11 changes: 11 additions & 0 deletions src/utils/utils.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package utils

import (
"crypto/sha1"
"fmt"
"log"
)

Expand All @@ -10,3 +12,12 @@ func Check(e error) {
log.Fatal(e)
}
}

// StringHash returns the SHA-1 digest of Txt as a lowercase hexadecimal
// string (40 characters).
func StringHash(Txt string) string {
	digest := sha1.Sum([]byte(Txt))
	return fmt.Sprintf("%x", digest[:])
}
2 changes: 1 addition & 1 deletion src/validate/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (

// print writes its arguments to standard output with fmt.Println.
// NOTE(review): two return statements follow because this view is a diff with
// its +/- markers stripped; presumably the first is the removed line and the
// second the added one — confirm against the actual file. As written here,
// only the first return is reachable.
func print(args ...interface{}) (interface{}, error) {
fmt.Println(args...)
return (bool)(true), nil
return ([]interface{})(args), nil
}

func strLen(args ...interface{}) (interface{}, error) {
Expand Down
51 changes: 51 additions & 0 deletions src/validate/two-pass-functions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package validate

/*
Two pass functions
(1) First pass through file computes something
(2) Second pass uses computed values to evaluate an expression
*/

import (
"fmt"
)

// groupMap stores the counts collected by groupCountFunc and read back by
// groupCountFuncEval. The outer key is the hash stub passed as the first
// argument; the inner key is the formatted group value; the value is the
// number of rows in that group whose counted value equalled the target value.
var groupMap = map[string]map[string]int{}

// groupCountFunc is the first-pass half of group_count: it tallies, per
// group, how many rows have a counted value equal to the target value and
// stores the tally in the package-level groupMap.
//
// Expected arguments:
//
//	args[0] - hash stub (string) that keeps results specific to one call site
//	args[1] - group value for the current row
//	args[2] - value of the counted column for the current row
//	args[3] - value to count
//
// args[1..3] are compared by their "%v" formatting, so 1 and "1" match.
//
// Returns true when isMissing reports the arguments as missing, otherwise the
// whole groupMap. A malformed call (too few arguments or a non-string hash
// stub) yields an error rather than a panic, so a bad rule is reported
// through the normal error path.
func groupCountFunc(args ...interface{}) (interface{}, error) {
	// NOTE(review): args is passed as a single value, not expanded with
	// args... — kept as in the original; confirm this is what isMissing expects.
	if m, _ := isMissing(args); m.(bool) {
		return (bool)(true), nil
	}
	if len(args) < 4 {
		return nil, fmt.Errorf("group_count: expected 4 arguments (hash, group, value, match), got %d", len(args))
	}
	hashCol, ok := args[0].(string) // groupHash
	if !ok {
		return nil, fmt.Errorf("group_count: hash key must be a string, got %T", args[0])
	}
	groupCol := fmt.Sprintf("%v", args[1])
	countCol := fmt.Sprintf("%v", args[2])
	eqVal := fmt.Sprintf("%v", args[3])

	counts := groupMap[hashCol]
	if counts == nil {
		counts = make(map[string]int)
		groupMap[hashCol] = counts
	}
	// Register the group even when nothing matches, so every group seen in
	// the first pass has an explicit entry.
	if _, seen := counts[groupCol]; !seen {
		counts[groupCol] = 0
	}
	if countCol == eqVal {
		counts[groupCol]++
	}
	return (map[string]map[string]int)(groupMap), nil
}

// groupCountFuncEval is the second-pass half of group_count: it looks up the
// tally that groupCountFunc stored in groupMap for the current row's group.
//
// Expected arguments:
//
//	args[0] - hash stub (string) identifying the call site
//	args[1] - group value for the current row (formatted with "%v")
//
// Returns true when isMissing reports the arguments as missing, otherwise the
// stored count as a float64 (0 for an unknown stub or group). A malformed
// call (too few arguments or a non-string hash stub) yields an error rather
// than a panic.
func groupCountFuncEval(args ...interface{}) (interface{}, error) {
	// NOTE(review): args is passed as a single value, not expanded with
	// args... — kept as in the original; confirm this is what isMissing expects.
	if m, _ := isMissing(args); m.(bool) {
		return (bool)(true), nil
	}
	if len(args) < 2 {
		return nil, fmt.Errorf("group_count: expected at least 2 arguments (hash, group), got %d", len(args))
	}
	hashCol, ok := args[0].(string) // groupHash
	if !ok {
		return nil, fmt.Errorf("group_count: hash key must be a string, got %T", args[0])
	}
	groupCol := fmt.Sprintf("%v", args[1])
	// Indexing a missing (nil) inner map is safe in Go and yields 0.
	return (float64)(groupMap[hashCol][groupCol]), nil
}
105 changes: 99 additions & 6 deletions src/validate/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ var utilFunctions = map[string]govaluate.ExpressionFunction{
"replace": replace,
// Cumulative
"last": last,
// two-pass
"group_count": groupCountFunc,
"group_count_eval": groupCountFuncEval,
}

// Define functions
Expand Down Expand Up @@ -84,6 +87,11 @@ var keyFunctions = []string{
"last",
}

// twoPassFunctions lists the functions that need a first pass over the input
// to collect data before the rules are evaluated, so the file is read twice.
// RunValidation appends these names to keyFunctions, so each call also gets a
// hashed first argument.
var twoPassFunctions = []string{
"group_count",
}

// RunValidation
func RunValidation(schema schema.SchemaRules, input string) bool {

Expand All @@ -110,27 +118,73 @@ func RunValidation(schema schema.SchemaRules, input string) bool {
schema.IsOrdered(colnames)
schema.IsFixed(colnames)

// Compile search functions
funcMatch, err := regexp.Compile(fmt.Sprintf("(%s)\\(", funcSet))
utils.Check(err)

// Key Functions - Require a hashed first element
// All twoPassFunctions are also key functions
keyFunctions = append(keyFunctions, twoPassFunctions...)

// Used to match keyFunctions
keyMatch, err := regexp.Compile(fmt.Sprintf("(%s)", strings.Join(keyFunctions, "|")))
utils.Check(err)

// Used to detect two pass functions
twoPassMatchFull, err := regexp.Compile(fmt.Sprintf("(%s)(\\([^)]+\\))", strings.Join(twoPassFunctions, "|")))
utils.Check(err)

twoPassExpr := make(map[string][]*govaluate.EvaluableExpression)
twoPassStub := make(map[string][]string)

/*
Format rules, passing implicit columns
*/
for idx, col := range schema.Columns {

// Allow for explicit references by removing them initially
explicitReplace, err := regexp.Compile(fmt.Sprintf("(%s)\\([ ]?%s[ ,)]", funcSet, col.Name))
rule = explicitReplace.ReplaceAllString(col.Rule, "$1(")

// Add implicit variables; Remove trailing commas
funcMatch, err := regexp.Compile(fmt.Sprintf("(%s)\\(", funcSet))
utils.Check(err)
rule = funcMatch.ReplaceAllString(rule, "$1(current_var_,")
rule = strings.Replace(rule, ",)", ")", -1)

// Key functions require the variable name to create a key
keyFunc, err := regexp.Compile(fmt.Sprintf("(%s)\\(([^)]+)", strings.Join(keyFunctions, "|")))
utils.Check(err)
rule = keyFunc.ReplaceAllString(rule, fmt.Sprintf("$1(\"%s:$2\",$2", col.Name))
// Add hash keys for certain functions
// These need a stub for results to be function-specific
var stub string
idxOffset := 0
twoPassIdx := keyMatch.FindAllStringIndex(rule, -1)
twoPassFuncs := keyMatch.FindAllString(rule, -1)
for idx := range twoPassIdx {
stub = utils.StringHash(fmt.Sprintf("%d-%s", twoPassIdx[idx], twoPassFuncs[idx]))
// Modify rule to insert hash
start := twoPassIdx[idx][0] + idxOffset + len(twoPassFuncs[idx]) + 1 // + 1 for '('
rule = fmt.Sprintf("%s\"%s\",%s", rule[:start], stub, rule[start:])
idxOffset = idxOffset + len(stub) + 3 // For two " and ,
}

// Add two pass expressions to be evaluated individually
twoPassMatch := twoPassMatchFull.FindAllString(rule, -1)
if len(twoPassMatch) > 0 {
twoPassExpr[col.Name] = make([]*govaluate.EvaluableExpression, len(twoPassMatch))
twoPassStub[col.Name] = make([]string, len(twoPassMatch))
for idx, expr := range twoPassMatch {
expr, err := govaluate.NewEvaluableExpressionWithFunctions(expr, functions)
utils.Check(err)
twoPassExpr[col.Name][idx] = expr
twoPassStub[col.Name][idx] = stub
}
}

// Convert two-pass functions to their 2nd pass version
rule = twoPassMatchFull.ReplaceAllString(rule, "${1}_eval$2")

// If no expression is supplied set to true
if rule == "" {
rule = "true"
}

// Parse expressions
expr, err := govaluate.NewEvaluableExpressionWithFunctions(rule, functions)
if err != nil {
Expand All @@ -149,7 +203,46 @@ func RunValidation(schema schema.SchemaRules, input string) bool {
parameters["false"] = false
parameters["data_"] = schema.YAMLData

// If two-pass functions are used, evaluate those here
// first, and then replace with second pass function.
stopRead := false
if len(twoPassExpr) > 0 {
fmt.Println(
aurora.Yellow("Two-pass functions are being used"))
for ok := true; ok; ok = (stopRead == false) {
record, readErr := f.Read()
if readErr == io.EOF {
stopRead = true
break
}
for idx := range record {
parameters[colnames[idx]] = typeConvert(record[idx], schema.NA, schema.EMPTY)
}

for colName, exprs := range twoPassExpr {
currentVar := parameters[colName]
parameters["current_var_"] = currentVar
for idx, expr := range exprs {
result, err := expr.Eval(parameters)
utils.Check(err)

// Add result to parameters
// TODO: See if there is a way to set parameter at end
// instead of at every iteration of loop
parameters[twoPassStub[colName][idx]] = result
}
}
}

// Update twoPass expressions to evaluate versions

f, err = reader.NewReader(input, schema)
f.ReadHeader()
}

utils.Check(err)

stopRead = false
for ok := true; ok; ok = (stopRead == false) {
record, readErr := f.Read()
if readErr == io.EOF {
Expand Down

0 comments on commit 3b15462

Please sign in to comment.