Skip to content

Commit

Permalink
Add 'chronam/' from commit 'ae53a3d0c5210ac8de0e65e2f336eb69491eaebd'
Browse files Browse the repository at this point in the history
git-subtree-dir: chronam
git-subtree-mainline: b21a8e6
git-subtree-split: ae53a3d
  • Loading branch information
lmullen committed Jan 23, 2020
2 parents b21a8e6 + ae53a3d commit 9c0d144
Show file tree
Hide file tree
Showing 18 changed files with 464,412 additions and 0 deletions.
34 changes: 34 additions & 0 deletions chronam/batch-chronam.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Export the Chronam pages in batches of 1,000 as CSV files to send to Argo.

library(tidyverse)
library(odbc)

db <- dbConnect(odbc::odbc(), "Research DB")

out_dir <- "/media/data/chronam-to-argo/"
batch <- 0

repeat {
  batch_label <- str_pad(batch, 5, pad = "0")
  out_path <- str_c(out_dir, "chronam-", batch_label, ".csv")

  # Resume support: skip any batch already exported by a previous run.
  if (file.exists(out_path)) {
    batch <- batch + 1
    next
  }

  message("Processing batch ", batch_label)
  rows <- tbl(db, "chronam_pages") %>%
    filter(trunc(id/1000) == batch) %>%
    select(doc_id, text) %>%
    collect()

  # An empty result set means we have walked past the last batch.
  if (nrow(rows) == 0) {
    message("Found the end of the batches. Quitting.")
    break
  }

  write_csv(rows, out_path, col_names = FALSE)

  batch <- batch + 1
}

dbDisconnect(db)
110 changes: 110 additions & 0 deletions chronam/chronam-import-text/chronam-import-text.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package main

import (
"database/sql"
"fmt"
_ "github.com/lib/pq"
"gopkg.in/cheggaaa/pb.v1"
"log"
"os"
"path/filepath"
"sync"
"time"
)

const (
	// WORKERS is how many workers to use at once.
	WORKERS int = 200

	// BUFFERSIZE is how many files to keep in the queue at one time.
	BUFFERSIZE int = 100000

	// PAGESGUESS is how many pages we think there are for the progress bar.
	PAGESGUESS int = 15006843
)

// wg tracks the worker goroutines so main can wait for them to finish
// draining the jobs channel before exiting.
var wg sync.WaitGroup

// worker drains the jobs channel, where each job is a string representing a
// path to an OCR text file on disk, and inserts each page's metadata and text
// into the chronam_pages table. Failures are logged and the file is skipped,
// so one bad page never stops the run. Start with `go worker(...)` after
// calling wg.Add(1).
func worker(jobs <-chan string, baseDir string, bar *pb.ProgressBar, db *sql.DB) {
	defer wg.Done()
	for path := range jobs {
		// Just increment the progress bar at the start instead of the end because
		// there are multiple failure possibilities and we don't need to include this
		// for each one.
		//
		// FIX: removed a leftover `time.Sleep(1 * time.Second)` here. With
		// ~15M pages and 200 workers that throttle alone added roughly 20
		// hours of pure sleeping; it appears to be debugging residue.
		bar.Increment()
		docid := getDocid(path, baseDir)

		// Check whether this docid exists. If it does skip the rest of the parsing,
		// especially reading in the file.
		// var exists bool
		// err := db.QueryRow("SELECT EXISTS (SELECT 1 FROM chronam_pages WHERE doc_id = $1)", docid).Scan(&exists)
		// if err != nil {
		// 	log.Printf("Failed to check if %s already is in the database.", docid)
		// 	log.Println(err)
		// }
		// if exists {
		// 	continue
		// }

		lccn, date, page := getMetadata(docid)
		text, err := getText(path)
		if err != nil {
			// FIX: format string was "Failed to read %s\n." (newline before
			// the period); log.Printf supplies the trailing newline itself.
			log.Printf("Failed to read %s.", path)
			log.Println(err)
			continue
		}
		wordcount := countWords(text)
		// ON CONFLICT DO NOTHING makes the insert idempotent so the importer
		// can be re-run over the same files safely.
		_, err = db.Exec("INSERT INTO chronam_pages VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT DO NOTHING;",
			docid, lccn, date, page, wordcount, text)
		if err != nil {
			log.Printf("Failed to write %s to database.\n", docid)
			log.Println(err)
			continue
		}
	}
}

// main walks the data directory named by the first command-line argument and
// feeds every .txt file it finds to a pool of workers that insert the pages
// into Postgres. The database password is read from the DBPASS environment
// variable.
func main() {

	// FIX: guard against a missing argument instead of panicking on
	// os.Args[1].
	if len(os.Args) < 2 {
		log.Fatal("usage: chronam-import-text <data-dir>")
	}
	dataDir := os.Args[1]

	constr := fmt.Sprintf("user=lmullen dbname=lmullen password=%s sslmode=disable", os.Getenv("DBPASS"))
	db, err := sql.Open("postgres", constr)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	// sql.Open does not actually connect; Ping verifies the credentials.
	if err := db.Ping(); err != nil {
		log.Fatal(err)
	}

	// Create the jobs channel which will hold filenames.
	jobs := make(chan string, BUFFERSIZE)

	bar := pb.New(PAGESGUESS)
	bar.SetRefreshRate(time.Second)
	bar.ShowTimeLeft = true

	// Start the workers which will begin to pull jobs off the channel.
	for w := 1; w <= WORKERS; w++ {
		wg.Add(1)
		go worker(jobs, dataDir, bar, db)
	}

	bar.Start()
	err = filepath.Walk(dataDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if !info.IsDir() && filepath.Ext(path) == ".txt" {
			jobs <- path
		}
		return nil
	})
	// FIX: the walk error was previously assigned but never checked, so a
	// partial traversal would go unnoticed.
	if err != nil {
		log.Println("Error walking the data directory:", err)
	}

	close(jobs)
	wg.Wait()
	bar.FinishPrint("Finished processing.")
}
12 changes: 12 additions & 0 deletions chronam/chronam-import-text/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
module github.com/public-bible/chronam/chronam-import-text

go 1.12

require (
github.com/fatih/color v1.7.0 // indirect
github.com/lib/pq v1.1.1
github.com/mattn/go-colorable v0.1.2 // indirect
github.com/mattn/go-runewidth v0.0.4 // indirect
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed // indirect
gopkg.in/cheggaaa/pb.v1 v1.0.28
)
15 changes: 15 additions & 0 deletions chronam/chronam-import-text/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/lib/pq v1.1.1 h1:sJZmqHoEaY7f+NPP8pgLB/WxulyR3fewgCM2qaSlBb4=
github.com/lib/pq v1.1.1/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/mattn/go-colorable v0.1.2 h1:/bC9yWikZXAL9uJdulbSfyVNIR3n3trXl+v8+1sx8mU=
github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE=
github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y=
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed h1:uPxWBzB3+mlnjy9W58qY1j/cjyFjutgw/Vhan2zLy/A=
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
gopkg.in/cheggaaa/pb.v1 v1.0.28 h1:n1tBJnnK2r7g9OW2btFH91V92STTUevLXYFb8gy9EMk=
gopkg.in/cheggaaa/pb.v1 v1.0.28/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw=
31 changes: 31 additions & 0 deletions chronam/chronam-import-text/parse-path.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package main

import (
"io/ioutil"
"path/filepath"
"regexp"
"strings"
)

// getDocid converts an OCR file path into a document ID by stripping the
// base data directory and the file name, keeping the relative directory with
// a trailing slash (e.g. "<lccn>/<date>/<edition>/<sequence>/").
func getDocid(path string, base string) string {
	relative := strings.Replace(path, base+"/", "", 1)
	return filepath.Dir(relative) + "/"
}

// pageNumRe extracts the numeric portion of the "seq-N" page component.
// FIX: hoisted to package scope — the original recompiled the regexp on
// every call, and this runs once per page on a multi-million page corpus.
var pageNumRe = regexp.MustCompile(`\d+`)

// getMetadata splits a document ID of the form
// "<lccn>/<date>/<edition>/seq-<page>/" into its LCCN, issue date, and page
// number components.
func getMetadata(docid string) (lccn, date, page string) {
	s := strings.Split(docid, "/")
	lccn = s[0]
	date = s[1]
	// s[2] is the edition (e.g. "ed-1"); s[3] is "seq-N", from which we keep
	// only the digits as the page number.
	page = pageNumRe.FindString(s[3])
	return
}

func getText(path string) (string, error) {
b, err := ioutil.ReadFile(path)
if err != nil {
return "", err
}
text := string(b)
return text, nil
}
9 changes: 9 additions & 0 deletions chronam/chronam-import-text/wordcount.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package main

import (
"strings"
)

// countWords reports the number of whitespace-separated tokens in s, which
// serves as a rough word count for an OCR page.
func countWords(s string) int {
	fields := strings.Fields(s)
	return len(fields)
}
16 changes: 16 additions & 0 deletions chronam/chronam.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
99 changes: 99 additions & 0 deletions chronam/download.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Download the Chronam (Chronicling America) batch and newspaper metadata and
# load it into the research database.
#
# Data flow: batch listing JSON -> batch and batch-to-LCCN tables ->
# per-newspaper (LCCN) JSON -> newspaper, issue, and place tables.

library(tidyverse)
library(fs)
library(jsonlite)
library(odbc)

# Figure out what the maximum batch is.
# NOTE(review): MAX_BATCH is hard-coded; confirm it still matches the number
# of batch listing pages at chroniclingamerica.loc.gov/batches/.
MAX_BATCH <- 77
batches <- 1:MAX_BATCH

batch_urls <- str_glue("https://chroniclingamerica.loc.gov/batches/{batches}.json")

# Download each batch listing page, skipping files already on disk so the
# script can be re-run without re-downloading everything.
walk(batch_urls, function(url) {
  out <- path("temp/batchjson", basename(url))
  if (!file_exists(out))
    download.file(url, out)
})

batch_files <- dir_ls("temp/batchjson")
batch_file_ids <- tools::file_path_sans_ext(basename(batch_files))
# Every expected batch listing must have been downloaded.
stopifnot(all(batches %in% batch_file_ids))

batch_json <- seq_along(batch_files) %>%
  map(function(i) { message(i); read_json(batch_files[i])})

# Extract element n from each item of a list (helper for the parsers below).
get_val <- function(x, n) { sapply(x, `[[`, n) }

# Turn one batch listing page into a tibble of batch-level metadata.
parse_batches <- function(l) {
  batch <- l$batches %>% get_val("name")
  url <- l$batches %>% get_val("url")
  page_count <- l$batches %>% get_val("page_count")
  ingested <- l$batches %>% get_val("ingested")
  tibble(batch, url, page_count, ingested)
}

chronam_batches <- batch_json %>% map_df(parse_batches)

# Batch names must be unique across all listing pages.
stopifnot(chronam_batches %>% count(batch) %>% filter(n > 1) %>% nrow() == 0)

# Map one batch to the LCCNs (newspaper identifiers) it contains.
parse_batch_for_lccns <- function(batch_l) {
  batch <- batch_l$name
  lccn <- batch_l$lccns %>% purrr::flatten_chr()
  stopifnot(length(batch) == 1)
  tibble(batch, lccn)
}

batch_to_lccn <- batch_json %>%
  map_df(function(x) map_df(x$batches, parse_batch_for_lccns))

# Each batch/LCCN pair should appear only once.
stopifnot(batch_to_lccn %>% count(batch, lccn) %>% filter(n > 1) %>% nrow() == 0)

lccns <- unique(batch_to_lccn$lccn)
lccn_urls <- str_glue("http://chroniclingamerica.loc.gov/lccn/{lccns}.json")

# Download per-newspaper metadata, again skipping already-downloaded files.
walk(lccn_urls, function(url) {
  out <- path("temp/lccnjson", basename(url))
  if (!file_exists(out))
    download.file(url, out)
})

lccn_files <- dir_ls("temp/lccnjson")
lccn_file_ids <- tools::file_path_sans_ext(basename(lccn_files))
stopifnot(all(lccns %in% lccn_file_ids))

lccn_json <- seq_along(lccn_files) %>%
  map(function(i) { message(i); read_json(lccn_files[i])})

# Turn one newspaper's JSON into a one-row tibble with nested issue and place
# list-columns, which are unnested into separate tables below.
parse_lccns <- function(l) {
  lccn <- l$lccn
  title <- l$name
  # publisher <- l$publisher
  place_of_pub <- l$place_of_publication
  url <- l$url
  # year_start <- l$start_year %>% as.integer()
  # year_end <- l$end_year %>% as.integer()
  issues_l <- l$issues %>% transpose()
  issues <- tibble(date = as.Date(issues_l$date_issued %>% simplify()),
                   url = issues_l$url %>% simplify()) %>% list()
  place <- l$place
  tibble(lccn, title, place_of_pub, url, issues, place)
}

lccn_data <- map_df(lccn_json, parse_lccns)

newspapers <- lccn_data %>% select(-issues, -place)
issues <- lccn_data %>% select(lccn, issues) %>% unnest()
# Places arrive as "state--county--city"; when only two parts are present the
# second part is the city, not the county, so shift it over.
places <- lccn_data %>% select(lccn, place) %>% unnest() %>%
  separate(place, into = c("state", "county", "city"), sep = "--", remove = FALSE) %>%
  mutate(dup = if_else(is.na(city), TRUE, FALSE),
         city = if_else(is.na(city), county, city),
         county = if_else(dup, NA_character_, county)) %>%
  select(-dup)

# NOTE(review): the DSN here is "ResearchDB" but batch-chronam.R connects to
# "Research DB" — confirm which data source name is correct.
con <- dbConnect(odbc::odbc(), "ResearchDB", timeout = 10)
dbWriteTable(con, "chronam_batch_to_lccn", batch_to_lccn)
dbWriteTable(con, "chronam_batches", chronam_batches)
dbWriteTable(con, "chronam_newspapers", newspapers)
dbWriteTable(con, "chronam_newspaper_places", places)
26 changes: 26 additions & 0 deletions chronam/get-ocr-files.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Create the download files for getting the OCR text: for every issue JSON
# file already mirrored from Chronicling America, derive the ocr.txt URL for
# each page and append it to a single URL list for wget.

library(tidyverse)
library(jsonlite)
library(fs)
library(progress)

# Open in append mode so the URL list can be built up across runs.
OUT_FILE <- file("/media/data/chronam-wget/all-page-urls.txt", "a")
# FIX: this definition was commented out, but JSONISSUES is used below for
# both the progress bar and the walk(), so the script could not run at all.
JSONISSUES <- dir_ls("/media/data/chronam-wget/chroniclingamerica.loc.gov/lccn/", type = "file", glob = "*.json", recurse = TRUE)

pb <- progress_bar$new(total = length(JSONISSUES))

# For one issue JSON file, write the https ocr.txt URL of every page to
# OUT_FILE, one per line.
get_ocr_urls <- function(f) {
  json <- read_json(f)
  batch <- json$batch$name
  pages_tr <- json$pages %>% transpose()
  urls <- pages_tr$url %>%
    simplify() %>%
    str_replace(".json", "/ocr.txt") %>%
    str_replace("http", "https")
  cat(urls, file = OUT_FILE, sep = "\n")
  pb$tick()
}
pb$tick(0)
walk(JSONISSUES, get_ocr_urls)
close(OUT_FILE)
23 changes: 23 additions & 0 deletions chronam/import-most-ocr-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Import as much of the OCR files as can be easily parsed in-memory in R
#
# NOTE(review): this script looks like an unfinished copy of get-ocr-files.R.
# get_page_data() still writes URLs to OUT_FILE, which is never defined in
# this script, and nothing ever calls get_page_data() (the final
# walk(JSONISSUES, ...) is missing). As written it only builds the file list
# and progress bar, then stops. Confirm the intended behavior before running.

library(tidyverse)
library(jsonlite)
library(fs)
library(progress)

# All issue-level JSON files mirrored from Chronicling America.
JSONISSUES <- dir_ls("/media/data/chronam-wget/chroniclingamerica.loc.gov/lccn/", type = "file", glob = "*.json", recurse = TRUE)

pb <- progress_bar$new(total = length(JSONISSUES))

# Parse one issue JSON file.
# NOTE(review): despite the name, the body currently derives ocr.txt URLs and
# appends them to the (undefined) OUT_FILE connection — apparently copied
# from get-ocr-files.R rather than importing page data.
get_page_data <- function(f) {
  json <- read_json(f)
  batch <- json$batch$name
  pages_tr <- json$pages %>% transpose()
  urls <- pages_tr$url %>%
    simplify() %>%
    str_replace(".json", "/ocr.txt") %>%
    str_replace("http", "https")
  cat(urls, file = OUT_FILE, sep = "\n")
  pb$tick()
}
pb$tick(0)
14 changes: 14 additions & 0 deletions chronam/ocr-quality/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module github.com/public-bible/chronam/ocr-quality

go 1.12

require (
github.com/euskadi31/go-tokenizer v1.0.0
github.com/fatih/color v1.7.0 // indirect
github.com/lib/pq v1.1.1
github.com/mattn/go-colorable v0.1.2 // indirect
github.com/mattn/go-runewidth v0.0.4 // indirect
github.com/stretchr/testify v1.3.0 // indirect
golang.org/x/sys v0.0.0-20190610200419-93c9922d18ae // indirect
gopkg.in/cheggaaa/pb.v1 v1.0.28
)
Loading

0 comments on commit 9c0d144

Please sign in to comment.