From af544bf5e505e36a4d083335e644edbb2c9b041a Mon Sep 17 00:00:00 2001 From: Dmitry Mozzherin Date: Thu, 25 Nov 2021 06:08:54 -0600 Subject: [PATCH] add year column --- export.go | 46 +++++++++++++---------------- go.mod | 2 +- rebuild/file_name_string.go | 22 ++++++++++++-- rebuild/file_name_string_indices.go | 7 ++++- rebuild/model.go | 2 ++ rebuild/words.go | 20 ++++++++----- 6 files changed, 63 insertions(+), 36 deletions(-) diff --git a/export.go b/export.go index 420e2f0..e835c90 100644 --- a/export.go +++ b/export.go @@ -1,36 +1,32 @@ package gnidump import ( - "fmt" - "log" - "github.com/gnames/gnidump/rebuild" - "github.com/gnames/gnsys" ) func (gnd GNIdump) PopulatePG() error { - var err error - log.Printf("Rebuilding '%s' database.\n", gnd.PgDB.PgDB) - if err = gnd.ResetDB(); err != nil { - return fmt.Errorf("reset of DB did not work: %w", err) - } - if err = gnd.Migrate(); err != nil { - return fmt.Errorf("cannot rebuild DB schema: %w", err) - } + // var err error + // log.Printf("Rebuilding '%s' database.\n", gnd.PgDB.PgDB) + // if err = gnd.ResetDB(); err != nil { + // return fmt.Errorf("reset of DB did not work: %w", err) + // } + // if err = gnd.Migrate(); err != nil { + // return fmt.Errorf("cannot rebuild DB schema: %w", err) + // } rb := rebuild.NewRebuild(gnd.PgDB, gnd.InputDir, gnd.JobsNum) - if err = gnsys.MakeDir(rb.ParserKeyValDir); err != nil { - return err - } - if err = rb.UploadNameString(); err != nil { - return fmt.Errorf("unable to populate name_strings table: %w", err) - } - if err = rb.UploadDataSources(); err != nil { - return fmt.Errorf("unable to populate data_sources table: %w", err) - } - - rb.UploadNameStringIndices() - rb.RemoveOrphans() - rb.CreateWords() + // if err = gnsys.MakeDir(rb.ParserKeyValDir); err != nil { + // return err + // } + // if err = rb.UploadNameString(); err != nil { + // return fmt.Errorf("unable to populate name_strings table: %w", err) + // } + // if err = rb.UploadDataSources(); err != nil { + // 
return fmt.Errorf("unable to populate data_sources table: %w", err) + // } + // + // rb.UploadNameStringIndices() + // rb.RemoveOrphans() + // rb.CreateWords() rb.VerificationView() return nil } diff --git a/go.mod b/go.mod index aca25d8..6fc3393 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/dgraph-io/badger/v2 v2.0.1 github.com/dustin/go-humanize v1.0.0 github.com/gnames/gnfmt v0.2.0 - github.com/gnames/gnparser v1.5.5 + github.com/gnames/gnparser v1.5.6 github.com/gnames/gnsys v0.2.2 github.com/gnames/gnuuid v0.1.1 github.com/go-sql-driver/mysql v1.5.0 diff --git a/rebuild/file_name_string.go b/rebuild/file_name_string.go index a774557..a35d42d 100644 --- a/rebuild/file_name_string.go +++ b/rebuild/file_name_string.go @@ -8,6 +8,7 @@ import ( "log" "os" "path/filepath" + "strconv" "strings" "sync" "time" @@ -17,6 +18,7 @@ import ( "github.com/gnames/gnfmt" "github.com/gnames/gnidump/keyval" "github.com/gnames/gnparser" + "github.com/gnames/gnparser/ent/parsed" "github.com/gnames/gnuuid" "github.com/lib/pq" ) @@ -148,7 +150,7 @@ func (rb Rebuild) saveCanonicals(cs []CanonicalData) { } func (rb Rebuild) saveNameStrings(db *sql.DB, ns []NameString) int64 { - columns := []string{"id", "name", "cardinality", "canonical_id", + columns := []string{"id", "name", "year", "cardinality", "canonical_id", "canonical_full_id", "canonical_stem_id", "virus", "bacteria", "surrogate", "parse_quality"} transaction, err := db.Begin() @@ -160,7 +162,7 @@ func (rb Rebuild) saveNameStrings(db *sql.DB, ns []NameString) int64 { log.Fatal(err) } for _, v := range ns { - _, err = stmt.Exec(v.ID, v.Name, v.Cardinality, v.CanonicalID, + _, err = stmt.Exec(v.ID, v.Name, v.Year, v.Cardinality, v.CanonicalID, v.CanonicalFullID, v.CanonicalStemID, v.Virus, v.Bacteria, v.Surrogate, v.ParseQuality) } @@ -235,11 +237,13 @@ func (rb Rebuild) workerNameString(kv *badger.DB, chIn <-chan []string, } var canonicalID, canonicalFullID, canonicalStemID sql.NullString var cardinality 
sql.NullInt32 + var year sql.NullInt16 if p.Parsed { cardinality = sql.NullInt32{ Int32: int32(p.Cardinality), Valid: true, } + year = parseYear(p) val := p.Canonical.Simple canonicalID = sql.NullString{ String: gnuuid.New(val).String(), @@ -287,6 +291,7 @@ func (rb Rebuild) workerNameString(kv *badger.DB, chIn <-chan []string, ID: p.VerbatimID, Name: p.Verbatim, Cardinality: cardinality, + Year: year, CanonicalID: canonicalID, CanonicalFullID: canonicalFullID, CanonicalStemID: canonicalStemID, @@ -342,3 +347,25 @@ func (rb Rebuild) loadNameStrings(chIn chan<- []string) { } close(chIn) } + +// parseYear extracts the publication year from the authorship of a parsed +// name. It returns a NULL value when the authorship or its year is missing, +// when the trimmed year is shorter than four characters, or when its first +// four characters are not a number. +func parseYear(p parsed.Parsed) sql.NullInt16 { + res := sql.NullInt16{} + if p.Authorship == nil || p.Authorship.Year == "" { + return res + } + yr := strings.Trim(p.Authorship.Year, "()") + // Guard the slice below: a year shorter than four bytes would + // otherwise panic with an out-of-range index. + if len(yr) < 4 { + return res + } + yrInt, err := strconv.Atoi(yr[:4]) + if err != nil { + return res + } + return sql.NullInt16{Int16: int16(yrInt), Valid: true} +} diff --git a/rebuild/file_name_string_indices.go b/rebuild/file_name_string_indices.go index 3010b0c..2c68c84 100644 --- a/rebuild/file_name_string_indices.go +++ b/rebuild/file_name_string_indices.go @@ -287,7 +287,7 @@ SELECT nsi.data_source_id, nsi.record_id, nsi.name_string_id, ns.name ON nsi.name_string_id = ns.id ) SELECT nsi.data_source_id, nsi.record_id, nsi.name_string_id, - ns.name, ns.cardinality, ns.canonical_id, ns.virus, ns.bacteria, + ns.name, ns.year, ns.cardinality, ns.canonical_id, ns.virus, ns.bacteria, ns.parse_quality, nsi.local_id, nsi.outlink_id, nsi.accepted_record_id, tn.name_string_id as accepted_name_id, tn.name as accepted_name, nsi.classification, nsi.classification_ranks, @@ -324,5 +324,10 @@ SELECT nsi.data_source_id, nsi.record_id, nsi.name_string_id, log.Printf("verificationView") log.Fatal(err) } + _, err = db.Exec("CREATE INDEX ON verification (year)") + if err != nil { + log.Printf("verificationView") + log.Fatal(err) + } log.Println("View verification is created") } diff --git a/rebuild/model.go b/rebuild/model.go index 
1c3a9c6..f25e362 100644 --- a/rebuild/model.go +++ b/rebuild/model.go @@ -65,6 +65,8 @@ type NameString struct { // Sometimes an authorship is concatenated with a name-string by our // import scripts. Name string `gorm:"type:varchar(255);not null"` + // Year is the year when a name was published + Year sql.NullInt16 `gorm:"type:int"` // Number of elements in a 'classic' Linnaen name: 0 - unknown, not available, // 1 - uninomial, 2 - binomial, 3 - trinomial etc. // Cardinality can be used to filter out surrogates and hybrid formulas -- diff --git a/rebuild/words.go b/rebuild/words.go index c06d094..287a1ff 100644 --- a/rebuild/words.go +++ b/rebuild/words.go @@ -84,14 +84,20 @@ func processParsedWords(gnp gnparser.GNparser, names []string) ([]Word, []WordNa idstr := fmt.Sprintf("%s|%d", mod, int(wt)) wordID := gnuuid.New(idstr).String() nw := WordNameString{NameStringID: nsID, CanonicalID: cID, WordID: wordID} - word := Word{ - ID: wordID, - Normalized: v.Normalized, - Modified: mod, - TypeID: int(wt), + switch wt { + case + parsed.SpEpithetType, + parsed.InfraspEpithetType, + parsed.AuthorWordType: + word := Word{ + ID: wordID, + Normalized: v.Normalized, + Modified: mod, + TypeID: int(wt), + } + words = append(words, word) + wordNames = append(wordNames, nw) } - words = append(words, word) - wordNames = append(wordNames, nw) } } return words, wordNames