Skip to content

Commit

Permalink
add year column
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Nov 25, 2021
1 parent 6c2f59a commit af544bf
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 36 deletions.
46 changes: 21 additions & 25 deletions export.go
Original file line number Diff line number Diff line change
@@ -1,36 +1,32 @@
package gnidump

import (
"fmt"
"log"

"github.com/gnames/gnidump/rebuild"
"github.com/gnames/gnsys"
)

// PopulatePG runs the steps that rebuild the PostgreSQL database named by
// gnd.PgDB.PgDB. In order, it resets the database, migrates the schema,
// creates the directory rb.ParserKeyValDir, uploads name-strings and data
// sources, and then calls the remaining rebuild steps (name-string indices,
// orphan removal, words, verification view).
//
// It returns the first error reported by ResetDB, Migrate, gnsys.MakeDir,
// UploadNameString or UploadDataSources; all of these except the MakeDir
// error are wrapped with context. It returns nil once the remaining steps
// have been called.
func (gnd GNIdump) PopulatePG() error {
var err error
log.Printf("Rebuilding '%s' database.\n", gnd.PgDB.PgDB)
if err = gnd.ResetDB(); err != nil {
return fmt.Errorf("reset of DB did not work: %w", err)
}
if err = gnd.Migrate(); err != nil {
return fmt.Errorf("cannot rebuild DB schema: %w", err)
}
// NOTE(review): the commented-out lines below duplicate the active
// statements above; they look like leftovers from temporarily disabling
// these steps -- confirm and remove.
// var err error
// log.Printf("Rebuilding '%s' database.\n", gnd.PgDB.PgDB)
// if err = gnd.ResetDB(); err != nil {
// return fmt.Errorf("reset of DB did not work: %w", err)
// }
// if err = gnd.Migrate(); err != nil {
// return fmt.Errorf("cannot rebuild DB schema: %w", err)
// }
rb := rebuild.NewRebuild(gnd.PgDB, gnd.InputDir, gnd.JobsNum)
// The directory must exist before the upload steps run; its error is
// returned as-is, without extra context.
if err = gnsys.MakeDir(rb.ParserKeyValDir); err != nil {
return err
}
if err = rb.UploadNameString(); err != nil {
return fmt.Errorf("unable to populate name_strings table: %w", err)
}
if err = rb.UploadDataSources(); err != nil {
return fmt.Errorf("unable to populate data_sources table: %w", err)
}

// No result of the following calls is checked here.
// NOTE(review): presumably they terminate the process on failure (e.g. via
// log.Fatal) instead of returning an error -- confirm in package rebuild.
rb.UploadNameStringIndices()
rb.RemoveOrphans()
rb.CreateWords()
// NOTE(review): the commented-out lines below duplicate the active
// statements above -- confirm and remove.
// if err = gnsys.MakeDir(rb.ParserKeyValDir); err != nil {
// return err
// }
// if err = rb.UploadNameString(); err != nil {
// return fmt.Errorf("unable to populate name_strings table: %w", err)
// }
// if err = rb.UploadDataSources(); err != nil {
// return fmt.Errorf("unable to populate data_sources table: %w", err)
// }
//
// rb.UploadNameStringIndices()
// rb.RemoveOrphans()
// rb.CreateWords()
rb.VerificationView()
return nil
}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require (
github.com/dgraph-io/badger/v2 v2.0.1
github.com/dustin/go-humanize v1.0.0
github.com/gnames/gnfmt v0.2.0
github.com/gnames/gnparser v1.5.5
github.com/gnames/gnparser v1.5.6
github.com/gnames/gnsys v0.2.2
github.com/gnames/gnuuid v0.1.1
github.com/go-sql-driver/mysql v1.5.0
Expand Down
22 changes: 20 additions & 2 deletions rebuild/file_name_string.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"log"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
Expand All @@ -17,6 +18,7 @@ import (
"github.com/gnames/gnfmt"
"github.com/gnames/gnidump/keyval"
"github.com/gnames/gnparser"
"github.com/gnames/gnparser/ent/parsed"
"github.com/gnames/gnuuid"
"github.com/lib/pq"
)
Expand Down Expand Up @@ -148,7 +150,7 @@ func (rb Rebuild) saveCanonicals(cs []CanonicalData) {
}

func (rb Rebuild) saveNameStrings(db *sql.DB, ns []NameString) int64 {
columns := []string{"id", "name", "cardinality", "canonical_id",
columns := []string{"id", "name", "year", "cardinality", "canonical_id",
"canonical_full_id", "canonical_stem_id", "virus", "bacteria", "surrogate",
"parse_quality"}
transaction, err := db.Begin()
Expand All @@ -160,7 +162,7 @@ func (rb Rebuild) saveNameStrings(db *sql.DB, ns []NameString) int64 {
log.Fatal(err)
}
for _, v := range ns {
_, err = stmt.Exec(v.ID, v.Name, v.Cardinality, v.CanonicalID,
_, err = stmt.Exec(v.ID, v.Name, v.Year, v.Cardinality, v.CanonicalID,
v.CanonicalFullID, v.CanonicalStemID, v.Virus, v.Bacteria, v.Surrogate,
v.ParseQuality)
}
Expand Down Expand Up @@ -235,11 +237,13 @@ func (rb Rebuild) workerNameString(kv *badger.DB, chIn <-chan []string,
}
var canonicalID, canonicalFullID, canonicalStemID sql.NullString
var cardinality sql.NullInt32
var year sql.NullInt16
if p.Parsed {
cardinality = sql.NullInt32{
Int32: int32(p.Cardinality),
Valid: true,
}
year = parseYear(p)
val := p.Canonical.Simple
canonicalID = sql.NullString{
String: gnuuid.New(val).String(),
Expand Down Expand Up @@ -287,6 +291,7 @@ func (rb Rebuild) workerNameString(kv *badger.DB, chIn <-chan []string,
ID: p.VerbatimID,
Name: p.Verbatim,
Cardinality: cardinality,
Year: year,
CanonicalID: canonicalID,
CanonicalFullID: canonicalFullID,
CanonicalStemID: canonicalStemID,
Expand Down Expand Up @@ -342,3 +347,16 @@ func (rb Rebuild) loadNameStrings(chIn chan<- []string) {
}
close(chIn)
}

// parseYear extracts a numeric year from the authorship of a parsed name.
//
// It returns a NULL (Valid == false) sql.NullInt16 when the name has no
// authorship, when the authorship has no year, when the year is shorter than
// four bytes once surrounding parentheses are stripped, or when its first
// four bytes are not an integer. Otherwise it returns the integer value of
// those first four bytes with Valid set to true.
func parseYear(p parsed.Parsed) sql.NullInt16 {
	res := sql.NullInt16{}
	if p.Authorship == nil || p.Authorship.Year == "" {
		return res
	}
	// Strip any leading/trailing parentheses around the year.
	// NOTE(review): presumably gnparser uses them to mark approximate
	// years -- confirm against the gnparser output format.
	yr := strings.Trim(p.Authorship.Year, "()")
	// Guard the slice below: a year shorter than four bytes (for example
	// "(99)" or "()") would otherwise panic with a slice-bounds error.
	if len(yr) < 4 {
		return res
	}
	// Only the first four bytes are parsed, so trailing characters after a
	// four-digit year are ignored.
	yrInt, err := strconv.Atoi(yr[:4])
	if err != nil {
		return res
	}
	// A value parsed from at most four bytes always fits into int16.
	return sql.NullInt16{Int16: int16(yrInt), Valid: true}
}
7 changes: 6 additions & 1 deletion rebuild/file_name_string_indices.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ SELECT nsi.data_source_id, nsi.record_id, nsi.name_string_id, ns.name
ON nsi.name_string_id = ns.id
)
SELECT nsi.data_source_id, nsi.record_id, nsi.name_string_id,
ns.name, ns.cardinality, ns.canonical_id, ns.virus, ns.bacteria,
ns.name, ns.year, ns.cardinality, ns.canonical_id, ns.virus, ns.bacteria,
ns.parse_quality, nsi.local_id, nsi.outlink_id, nsi.accepted_record_id,
tn.name_string_id as accepted_name_id,
tn.name as accepted_name, nsi.classification, nsi.classification_ranks,
Expand Down Expand Up @@ -324,5 +324,10 @@ SELECT nsi.data_source_id, nsi.record_id, nsi.name_string_id,
log.Printf("verificationView")
log.Fatal(err)
}
_, err = db.Exec("CREATE INDEX ON verification (year)")
if err != nil {
log.Printf("verificationView")
log.Fatal(err)
}
log.Println("View verification is created")
}
2 changes: 2 additions & 0 deletions rebuild/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ type NameString struct {
// Sometimes an authorship is concatenated with a name-string by our
// import scripts.
Name string `gorm:"type:varchar(255);not null"`
// Year is the year when a name was published
Year sql.NullInt16 `gorm:"type:int"`
// Number of elements in a 'classic' Linnaean name: 0 - unknown, not available,
// 1 - uninomial, 2 - binomial, 3 - trinomial etc.
// Cardinality can be used to filter out surrogates and hybrid formulas --
Expand Down
20 changes: 13 additions & 7 deletions rebuild/words.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,20 @@ func processParsedWords(gnp gnparser.GNparser, names []string) ([]Word, []WordNa
idstr := fmt.Sprintf("%s|%d", mod, int(wt))
wordID := gnuuid.New(idstr).String()
nw := WordNameString{NameStringID: nsID, CanonicalID: cID, WordID: wordID}
word := Word{
ID: wordID,
Normalized: v.Normalized,
Modified: mod,
TypeID: int(wt),
switch wt {
case
parsed.SpEpithetType,
parsed.InfraspEpithetType,
parsed.AuthorWordType:
word := Word{
ID: wordID,
Normalized: v.Normalized,
Modified: mod,
TypeID: int(wt),
}
words = append(words, word)
wordNames = append(wordNames, nw)
}
words = append(words, word)
wordNames = append(wordNames, nw)
}
}
return words, wordNames
Expand Down

0 comments on commit af544bf

Please sign in to comment.