From 4e5894be2469ca42046850f85f0aa1c75c13a5d3 Mon Sep 17 00:00:00 2001 From: Dmitry Mozzherin Date: Tue, 31 Jan 2023 02:31:38 +0000 Subject: [PATCH] remove duplicate occurrences from short version of dump (fix #65) --- internal/bhlindex.go | 20 ++++++++++++++++++++ internal/ent/output/interface.go | 4 ++++ internal/ent/output/name-format.go | 4 ++++ internal/ent/output/name-short-format.go | 4 ++++ internal/ent/output/occur-format.go | 4 ++++ internal/ent/output/occur-short-format.go | 4 ++++ internal/ent/output/odds-verif-format.go | 4 ++++ 7 files changed, 44 insertions(+) diff --git a/internal/bhlindex.go b/internal/bhlindex.go index b4e460a5..c53050bc 100644 --- a/internal/bhlindex.go +++ b/internal/bhlindex.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "path/filepath" + "reflect" "strings" "sync" @@ -257,9 +258,12 @@ func processOutput[O output.Output]( ch <-chan []O, ) error { var o O + var shortOccur bool var err error var w *os.File var count int + var pageID string + pageName := make(map[string]bool) for rows := range ch { select { case <-ctx.Done(): @@ -268,6 +272,10 @@ func processOutput[O output.Output]( for i := range rows { o = rows[i] if count == 0 { + t := reflect.TypeOf(o).String() + if t == "output.OutputOccurrenceShort" { + shortOccur = true + } path := filepath.Join(bi.OutputDir, o.Name()+bi.extension()) w, err = os.Create(path) if err != nil { @@ -285,6 +293,18 @@ func processOutput[O output.Output]( fmt.Fprintf(os.Stderr, "\r%s\r", strings.Repeat(" ", 80)) log.Info().Msgf("Processed %d %s", count, o.Name()) } + if shortOccur { + pg, nm := o.PageNameIDs() + if pageID != pg { + pageName = make(map[string]bool) + pageID = pg + } + key := pg + "|" + nm + if pageName[key] { + continue + } + pageName[key] = true + } _, err = w.WriteString(output.Format(rows[i], bi.OutputFormat) + "\n") if err != nil { return err diff --git a/internal/ent/output/interface.go b/internal/ent/output/interface.go index 1e9cdced..e517e0f9 100644 --- 
a/internal/ent/output/interface.go +++ b/internal/ent/output/interface.go @@ -32,4 +32,8 @@ type Output interface { // jsonOutput provides a method to generate a JSON output. jsonOutput(bool) string + + // PageNameIDs returns the PageID and NameID as strings, if available, and + // two empty strings otherwise. + PageNameIDs() (string, string) } diff --git a/internal/ent/output/name-format.go b/internal/ent/output/name-format.go index f94e9dba..8a4d8d6d 100644 --- a/internal/ent/output/name-format.go +++ b/internal/ent/output/name-format.go @@ -53,3 +53,7 @@ func (on OutputName) jsonOutput(pretty bool) string { res, _ := enc.Encode(on) return string(res) } + +func (on OutputName) PageNameIDs() (string, string) { + return "", "" +} diff --git a/internal/ent/output/name-short-format.go b/internal/ent/output/name-short-format.go index 3fb3642b..a62c126e 100644 --- a/internal/ent/output/name-short-format.go +++ b/internal/ent/output/name-short-format.go @@ -71,3 +71,7 @@ func (on OutputNameShort) jsonOutput(pretty bool) string { res, _ := enc.Encode(out) return string(res) } + +func (on OutputNameShort) PageNameIDs() (string, string) { + return "", "" +} diff --git a/internal/ent/output/occur-format.go b/internal/ent/output/occur-format.go index 83be005c..ed31f5ac 100644 --- a/internal/ent/output/occur-format.go +++ b/internal/ent/output/occur-format.go @@ -41,3 +41,7 @@ func (o OutputOccurrence) jsonOutput(pretty bool) string { res, _ := enc.Encode(o) return string(res) } + +func (o OutputOccurrence) PageNameIDs() (string, string) { + return strconv.Itoa(o.PageID), o.NameID +} diff --git a/internal/ent/output/occur-short-format.go b/internal/ent/output/occur-short-format.go index c1820d5c..25ec2cd9 100644 --- a/internal/ent/output/occur-short-format.go +++ b/internal/ent/output/occur-short-format.go @@ -37,3 +37,7 @@ func (o OutputOccurrenceShort) jsonOutput(pretty bool) string { res, _ := enc.Encode(oShort) return string(res) } + +func (o OutputOccurrenceShort) PageNameIDs() (string, 
string) { + return strconv.Itoa(o.PageID), o.NameID +} diff --git a/internal/ent/output/odds-verif-format.go b/internal/ent/output/odds-verif-format.go index d6ba1665..07c1dcbc 100644 --- a/internal/ent/output/odds-verif-format.go +++ b/internal/ent/output/odds-verif-format.go @@ -27,3 +27,7 @@ func (o OutputOddsVerification) jsonOutput(pretty bool) string { res, _ := enc.Encode(o) return string(res) } + +func (o OutputOddsVerification) PageNameIDs() (string, string) { + return "", "" +}