Skip to content

Commit

Permalink
Add 'chronam/' from commit 'ae53a3d0c5210ac8de0e65e2f336eb69491eaebd'
Browse files Browse the repository at this point in the history
git-subtree-dir: chronam
git-subtree-mainline: b21a8e6
git-subtree-split: ae53a3d
  • Loading branch information
lmullen committed Jan 23, 2020
2 parents b21a8e6 + ae53a3d commit 9c0d144
Show file tree
Hide file tree
Showing 18 changed files with 464,412 additions and 0 deletions.
34 changes: 34 additions & 0 deletions chronam/batch-chronam.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Export the Chronam pages in batches of 1,000 as CSV files to send to Argo.

library(tidyverse)
library(odbc)

db <- dbConnect(odbc::odbc(), "Research DB")

out_dir <- "/media/data/chronam-to-argo/"
batch <- 0

repeat {
  batch_label <- str_pad(batch, 5, pad = "0")
  out_path <- str_c(out_dir, "chronam-", batch_label, ".csv")

  # Resume support: skip any batch already exported by a previous run.
  if (file.exists(out_path)) {
    batch <- batch + 1
    next
  }

  message("Processing batch ", batch_label)
  rows <- tbl(db, "chronam_pages") %>%
    filter(trunc(id/1000) == batch) %>%
    select(doc_id, text) %>%
    collect()

  # An empty result set means we have walked past the last batch.
  if (nrow(rows) == 0) {
    message("Found the end of the batches. Quitting.")
    break
  }

  write_csv(rows, out_path, col_names = FALSE)

  batch <- batch + 1
}

dbDisconnect(db)
110 changes: 110 additions & 0 deletions chronam/chronam-import-text/chronam-import-text.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package main

import (
"database/sql"
"fmt"
_ "github.com/lib/pq"
"gopkg.in/cheggaaa/pb.v1"
"log"
"os"
"path/filepath"
"sync"
"time"
)

const (
	// WORKERS is how many workers to use at once.
	WORKERS int = 200

	// BUFFERSIZE is how many files to keep in the queue at one time.
	BUFFERSIZE int = 100000

	// PAGESGUESS is how many pages we think there are for the progress bar.
	PAGESGUESS int = 15006843
)

// wg tracks the worker goroutines so main can wait for them to finish
// draining the jobs channel before exiting.
var wg sync.WaitGroup

// worker drains the jobs channel, where each job is a string representing a
// path to an OCR text file on disk, and inserts each page's metadata and text
// into the chronam_pages table. Failures are logged and the file is skipped,
// so one bad page never stops the run. Start with `go worker(...)` after
// calling wg.Add(1).
func worker(jobs <-chan string, baseDir string, bar *pb.ProgressBar, db *sql.DB) {
	defer wg.Done()
	for path := range jobs {
		// Just increment the progress bar at the start instead of the end because
		// there are multiple failure possibilities and we don't need to include this
		// for each one.
		//
		// FIX: removed a leftover `time.Sleep(1 * time.Second)` here. With
		// ~15M pages and 200 workers that throttle alone added roughly 20
		// hours of pure sleeping; it appears to be debugging residue.
		bar.Increment()
		docid := getDocid(path, baseDir)

		// Check whether this docid exists. If it does skip the rest of the parsing,
		// especially reading in the file.
		// var exists bool
		// err := db.QueryRow("SELECT EXISTS (SELECT 1 FROM chronam_pages WHERE doc_id = $1)", docid).Scan(&exists)
		// if err != nil {
		// 	log.Printf("Failed to check if %s already is in the database.", docid)
		// 	log.Println(err)
		// }
		// if exists {
		// 	continue
		// }

		lccn, date, page := getMetadata(docid)
		text, err := getText(path)
		if err != nil {
			// FIX: format string was "Failed to read %s\n." (newline before
			// the period); log.Printf supplies the trailing newline itself.
			log.Printf("Failed to read %s.", path)
			log.Println(err)
			continue
		}
		wordcount := countWords(text)
		// ON CONFLICT DO NOTHING makes the insert idempotent so the importer
		// can be re-run over the same files safely.
		_, err = db.Exec("INSERT INTO chronam_pages VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT DO NOTHING;",
			docid, lccn, date, page, wordcount, text)
		if err != nil {
			log.Printf("Failed to write %s to database.\n", docid)
			log.Println(err)
			continue
		}
	}
}

// main walks the data directory named by the first command-line argument and
// feeds every .txt file it finds to a pool of workers that insert the pages
// into Postgres. The database password is read from the DBPASS environment
// variable.
func main() {

	// FIX: guard against a missing argument instead of panicking on
	// os.Args[1].
	if len(os.Args) < 2 {
		log.Fatal("usage: chronam-import-text <data-dir>")
	}
	dataDir := os.Args[1]

	constr := fmt.Sprintf("user=lmullen dbname=lmullen password=%s sslmode=disable", os.Getenv("DBPASS"))
	db, err := sql.Open("postgres", constr)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	// sql.Open does not actually connect; Ping verifies the credentials.
	if err := db.Ping(); err != nil {
		log.Fatal(err)
	}

	// Create the jobs channel which will hold filenames.
	jobs := make(chan string, BUFFERSIZE)

	bar := pb.New(PAGESGUESS)
	bar.SetRefreshRate(time.Second)
	bar.ShowTimeLeft = true

	// Start the workers which will begin to pull jobs off the channel.
	for w := 1; w <= WORKERS; w++ {
		wg.Add(1)
		go worker(jobs, dataDir, bar, db)
	}

	bar.Start()
	err = filepath.Walk(dataDir, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if !info.IsDir() && filepath.Ext(path) == ".txt" {
			jobs <- path
		}
		return nil
	})
	// FIX: the walk error was previously assigned but never checked, so a
	// partial traversal would go unnoticed.
	if err != nil {
		log.Println("Error walking the data directory:", err)
	}

	close(jobs)
	wg.Wait()
	bar.FinishPrint("Finished processing.")
}
12 changes: 12 additions & 0 deletions chronam/chronam-import-text/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
module github.com/public-bible/chronam/chronam-import-text

go 1.12

require (
github.com/fatih/color v1.7.0 // indirect
github.com/lib/pq v1.1.1
github.com/mattn/go-colorable v0.1.2 // indirect
github.com/mattn/go-runewidth v0.0.4 // indirect
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed // indirect
gopkg.in/cheggaaa/pb.v1 v1.0.28
)
15 changes: 15 additions & 0 deletions chronam/chronam-import-text/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys=
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4=
github.com/lib/pq v1.1.1 h1:sJZmqHoEaY7f+NPP8pgLB/WxulyR3fewgCM2qaSlBb4=
github.com/lib/pq v1.1.1/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/mattn/go-colorable v0.1.2 h1:/bC9yWikZXAL9uJdulbSfyVNIR3n3trXl+v8+1sx8mU=
github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE=
github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s=
github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y=
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed h1:uPxWBzB3+mlnjy9W58qY1j/cjyFjutgw/Vhan2zLy/A=
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
gopkg.in/cheggaaa/pb.v1 v1.0.28 h1:n1tBJnnK2r7g9OW2btFH91V92STTUevLXYFb8gy9EMk=
gopkg.in/cheggaaa/pb.v1 v1.0.28/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw=
31 changes: 31 additions & 0 deletions chronam/chronam-import-text/parse-path.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package main

import (
"io/ioutil"
"path/filepath"
"regexp"
"strings"
)

// getDocid converts an OCR file path into a document ID by stripping the
// base data directory and the file name, keeping the relative directory with
// a trailing slash (e.g. "<lccn>/<date>/<edition>/<sequence>/").
func getDocid(path string, base string) string {
	relative := strings.Replace(path, base+"/", "", 1)
	return filepath.Dir(relative) + "/"
}

// pageNumRe extracts the numeric portion of the "seq-N" page component.
// FIX: hoisted to package scope — the original recompiled the regexp on
// every call, and this runs once per page on a multi-million page corpus.
var pageNumRe = regexp.MustCompile(`\d+`)

// getMetadata splits a document ID of the form
// "<lccn>/<date>/<edition>/seq-<page>/" into its LCCN, issue date, and page
// number components.
func getMetadata(docid string) (lccn, date, page string) {
	s := strings.Split(docid, "/")
	lccn = s[0]
	date = s[1]
	// s[2] is the edition (e.g. "ed-1"); s[3] is "seq-N", from which we keep
	// only the digits as the page number.
	page = pageNumRe.FindString(s[3])
	return
}

func getText(path string) (string, error) {
b, err := ioutil.ReadFile(path)
if err != nil {
return "", err
}
text := string(b)
return text, nil
}
9 changes: 9 additions & 0 deletions chronam/chronam-import-text/wordcount.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package main

import (
"strings"
)

// countWords reports the number of whitespace-separated tokens in s, which
// serves as a rough word count for an OCR page.
func countWords(s string) int {
	fields := strings.Fields(s)
	return len(fields)
}
16 changes: 16 additions & 0 deletions chronam/chronam.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
99 changes: 99 additions & 0 deletions chronam/download.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Download the Chronam (Chronicling America) batch and newspaper metadata and
# load it into the research database.
#
# Data flow: batch listing JSON -> batch and batch-to-LCCN tables ->
# per-newspaper (LCCN) JSON -> newspaper, issue, and place tables.

library(tidyverse)
library(fs)
library(jsonlite)
library(odbc)

# Figure out what the maximum batch is.
# NOTE(review): MAX_BATCH is hard-coded; confirm it still matches the number
# of batch listing pages at chroniclingamerica.loc.gov/batches/.
MAX_BATCH <- 77
batches <- 1:MAX_BATCH

batch_urls <- str_glue("https://chroniclingamerica.loc.gov/batches/{batches}.json")

# Download each batch listing page, skipping files already on disk so the
# script can be re-run without re-downloading everything.
walk(batch_urls, function(url) {
  out <- path("temp/batchjson", basename(url))
  if (!file_exists(out))
    download.file(url, out)
})

batch_files <- dir_ls("temp/batchjson")
batch_file_ids <- tools::file_path_sans_ext(basename(batch_files))
# Every expected batch listing must have been downloaded.
stopifnot(all(batches %in% batch_file_ids))

batch_json <- seq_along(batch_files) %>%
  map(function(i) { message(i); read_json(batch_files[i])})

# Extract element n from each item of a list (helper for the parsers below).
get_val <- function(x, n) { sapply(x, `[[`, n) }

# Turn one batch listing page into a tibble of batch-level metadata.
parse_batches <- function(l) {
  batch <- l$batches %>% get_val("name")
  url <- l$batches %>% get_val("url")
  page_count <- l$batches %>% get_val("page_count")
  ingested <- l$batches %>% get_val("ingested")
  tibble(batch, url, page_count, ingested)
}

chronam_batches <- batch_json %>% map_df(parse_batches)

# Batch names must be unique across all listing pages.
stopifnot(chronam_batches %>% count(batch) %>% filter(n > 1) %>% nrow() == 0)

# Map one batch to the LCCNs (newspaper identifiers) it contains.
parse_batch_for_lccns <- function(batch_l) {
  batch <- batch_l$name
  lccn <- batch_l$lccns %>% purrr::flatten_chr()
  stopifnot(length(batch) == 1)
  tibble(batch, lccn)
}

batch_to_lccn <- batch_json %>%
  map_df(function(x) map_df(x$batches, parse_batch_for_lccns))

# Each batch/LCCN pair should appear only once.
stopifnot(batch_to_lccn %>% count(batch, lccn) %>% filter(n > 1) %>% nrow() == 0)

lccns <- unique(batch_to_lccn$lccn)
lccn_urls <- str_glue("http://chroniclingamerica.loc.gov/lccn/{lccns}.json")

# Download per-newspaper metadata, again skipping already-downloaded files.
walk(lccn_urls, function(url) {
  out <- path("temp/lccnjson", basename(url))
  if (!file_exists(out))
    download.file(url, out)
})

lccn_files <- dir_ls("temp/lccnjson")
lccn_file_ids <- tools::file_path_sans_ext(basename(lccn_files))
stopifnot(all(lccns %in% lccn_file_ids))

lccn_json <- seq_along(lccn_files) %>%
  map(function(i) { message(i); read_json(lccn_files[i])})

# Turn one newspaper's JSON into a one-row tibble with nested issue and place
# list-columns, which are unnested into separate tables below.
parse_lccns <- function(l) {
  lccn <- l$lccn
  title <- l$name
  # publisher <- l$publisher
  place_of_pub <- l$place_of_publication
  url <- l$url
  # year_start <- l$start_year %>% as.integer()
  # year_end <- l$end_year %>% as.integer()
  issues_l <- l$issues %>% transpose()
  issues <- tibble(date = as.Date(issues_l$date_issued %>% simplify()),
                   url = issues_l$url %>% simplify()) %>% list()
  place <- l$place
  tibble(lccn, title, place_of_pub, url, issues, place)
}

lccn_data <- map_df(lccn_json, parse_lccns)

newspapers <- lccn_data %>% select(-issues, -place)
issues <- lccn_data %>% select(lccn, issues) %>% unnest()
# Places arrive as "state--county--city"; when only two parts are present the
# second part is the city, not the county, so shift it over.
places <- lccn_data %>% select(lccn, place) %>% unnest() %>%
  separate(place, into = c("state", "county", "city"), sep = "--", remove = FALSE) %>%
  mutate(dup = if_else(is.na(city), TRUE, FALSE),
         city = if_else(is.na(city), county, city),
         county = if_else(dup, NA_character_, county)) %>%
  select(-dup)

# NOTE(review): the DSN here is "ResearchDB" but batch-chronam.R connects to
# "Research DB" — confirm which data source name is correct.
con <- dbConnect(odbc::odbc(), "ResearchDB", timeout = 10)
dbWriteTable(con, "chronam_batch_to_lccn", batch_to_lccn)
dbWriteTable(con, "chronam_batches", chronam_batches)
dbWriteTable(con, "chronam_newspapers", newspapers)
dbWriteTable(con, "chronam_newspaper_places", places)
26 changes: 26 additions & 0 deletions chronam/get-ocr-files.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Create the download files for getting the OCR text: for every issue JSON
# file already mirrored from Chronicling America, derive the ocr.txt URL for
# each page and append it to a single URL list for wget.

library(tidyverse)
library(jsonlite)
library(fs)
library(progress)

# Open in append mode so the URL list can be built up across runs.
OUT_FILE <- file("/media/data/chronam-wget/all-page-urls.txt", "a")
# FIX: this definition was commented out, but JSONISSUES is used below for
# both the progress bar and the walk(), so the script could not run at all.
JSONISSUES <- dir_ls("/media/data/chronam-wget/chroniclingamerica.loc.gov/lccn/", type = "file", glob = "*.json", recurse = TRUE)

pb <- progress_bar$new(total = length(JSONISSUES))

# For one issue JSON file, write the https ocr.txt URL of every page to
# OUT_FILE, one per line.
get_ocr_urls <- function(f) {
  json <- read_json(f)
  batch <- json$batch$name
  pages_tr <- json$pages %>% transpose()
  urls <- pages_tr$url %>%
    simplify() %>%
    str_replace(".json", "/ocr.txt") %>%
    str_replace("http", "https")
  cat(urls, file = OUT_FILE, sep = "\n")
  pb$tick()
}
pb$tick(0)
walk(JSONISSUES, get_ocr_urls)
close(OUT_FILE)
23 changes: 23 additions & 0 deletions chronam/import-most-ocr-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Import as much of the OCR files as can be easily parsed in-memory in R
#
# NOTE(review): this script looks like an unfinished copy of get-ocr-files.R.
# get_page_data() still writes URLs to OUT_FILE, which is never defined in
# this script, and nothing ever calls get_page_data() (the final
# walk(JSONISSUES, ...) is missing). As written it only builds the file list
# and progress bar, then stops. Confirm the intended behavior before running.

library(tidyverse)
library(jsonlite)
library(fs)
library(progress)

# All issue-level JSON files mirrored from Chronicling America.
JSONISSUES <- dir_ls("/media/data/chronam-wget/chroniclingamerica.loc.gov/lccn/", type = "file", glob = "*.json", recurse = TRUE)

pb <- progress_bar$new(total = length(JSONISSUES))

# Parse one issue JSON file.
# NOTE(review): despite the name, the body currently derives ocr.txt URLs and
# appends them to the (undefined) OUT_FILE connection — apparently copied
# from get-ocr-files.R rather than importing page data.
get_page_data <- function(f) {
  json <- read_json(f)
  batch <- json$batch$name
  pages_tr <- json$pages %>% transpose()
  urls <- pages_tr$url %>%
    simplify() %>%
    str_replace(".json", "/ocr.txt") %>%
    str_replace("http", "https")
  cat(urls, file = OUT_FILE, sep = "\n")
  pb$tick()
}
pb$tick(0)
14 changes: 14 additions & 0 deletions chronam/ocr-quality/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module github.com/public-bible/chronam/ocr-quality

go 1.12

require (
github.com/euskadi31/go-tokenizer v1.0.0
github.com/fatih/color v1.7.0 // indirect
github.com/lib/pq v1.1.1
github.com/mattn/go-colorable v0.1.2 // indirect
github.com/mattn/go-runewidth v0.0.4 // indirect
github.com/stretchr/testify v1.3.0 // indirect
golang.org/x/sys v0.0.0-20190610200419-93c9922d18ae // indirect
gopkg.in/cheggaaa/pb.v1 v1.0.28
)
Loading

0 comments on commit 9c0d144

Please sign in to comment.