-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add 'chronam/' from commit 'ae53a3d0c5210ac8de0e65e2f336eb69491eaebd'
- Loading branch information
Showing
18 changed files
with
464,412 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Export the Chronam batches to send to Argo

library(tidyverse)
library(odbc)
db <- dbConnect(odbc::odbc(), "Research DB")

out_dir <- "/media/data/chronam-to-argo/"
batch <- 0

# Walk through the table in batches of 1,000 pages (grouped by id), writing
# each batch to its own CSV. Batches whose output file already exists are
# skipped, so the script can be re-run to resume an interrupted export.
repeat {
  batch_label <- str_pad(batch, 5, pad = "0")
  batch_path <- str_c(out_dir, "chronam-", batch_label, ".csv")

  if (!file.exists(batch_path)) {
    message("Processing batch ", batch_label)
    batch_rows <- tbl(db, "chronam_pages") %>%
      filter(trunc(id / 1000) == batch) %>%
      select(doc_id, text) %>%
      collect()

    # An empty result means we have walked past the last batch
    if (nrow(batch_rows) == 0) {
      message("Found the end of the batches. Quitting.")
      break
    }

    write_csv(batch_rows, batch_path, col_names = FALSE)
  }

  batch <- batch + 1
}

dbDisconnect(db)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
package main | ||
|
||
import ( | ||
"database/sql" | ||
"fmt" | ||
_ "github.com/lib/pq" | ||
"gopkg.in/cheggaaa/pb.v1" | ||
"log" | ||
"os" | ||
"path/filepath" | ||
"sync" | ||
"time" | ||
) | ||
|
||
// Tuning knobs for the import run.
const (
	// WORKERS is how many workers to use at once
	WORKERS int = 200

	// BUFFERSIZE is how many files to keep in the queue at one time
	BUFFERSIZE int = 100000

	// PAGESGUESS is how many pages we think there are for the progress bar
	PAGESGUESS int = 15006843
)

// wg tracks the worker goroutines so main can wait for the queue to drain.
var wg sync.WaitGroup
|
||
// Each job is a string representing a path to an OCR text file on disk | ||
func worker(jobs <-chan string, baseDir string, bar *pb.ProgressBar, db *sql.DB) { | ||
defer wg.Done() | ||
for path := range jobs { | ||
// Just increment the progress bar at the start instead of the end because | ||
// there are multiple failure possibilities and we dont need to include this | ||
// for each one. | ||
time.Sleep(1 * time.Second) | ||
bar.Increment() | ||
docid := getDocid(path, baseDir) | ||
|
||
// Check whether this docid exists. If it does skip the rest of the parsing, | ||
// especially reading in the file. | ||
// var exists bool | ||
// err := db.QueryRow("SELECT EXISTS (SELECT 1 FROM chronam_pages WHERE doc_id = $1)", docid).Scan(&exists) | ||
// if err != nil { | ||
// log.Printf("Failed to check if %s already is in the database.", docid) | ||
// log.Println(err) | ||
// } | ||
// if exists { | ||
// continue | ||
// } | ||
|
||
lccn, date, page := getMetadata(docid) | ||
text, err := getText(path) | ||
if err != nil { | ||
log.Printf("Failed to read %s\n.", path) | ||
log.Println(err) | ||
continue | ||
} | ||
wordcount := countWords(text) | ||
_, err = db.Exec("INSERT INTO chronam_pages VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT DO NOTHING;", | ||
docid, lccn, date, page, wordcount, text) | ||
if err != nil { | ||
log.Printf("Failed to write %s to database.\n", docid) | ||
log.Println(err) | ||
continue | ||
} | ||
} | ||
} | ||
|
||
func main() { | ||
|
||
dataDir := os.Args[1] | ||
|
||
constr := fmt.Sprintf("user=lmullen dbname=lmullen password=%s sslmode=disable", os.Getenv("DBPASS")) | ||
db, err := sql.Open("postgres", constr) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
defer db.Close() | ||
if err := db.Ping(); err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
// Create the jobs channel which will hold filenames | ||
jobs := make(chan string, BUFFERSIZE) | ||
|
||
bar := pb.New(PAGESGUESS) | ||
bar.SetRefreshRate(time.Second) | ||
bar.ShowTimeLeft = true | ||
|
||
// Start the workers which will begin to pull jobs off the channel | ||
for w := 1; w <= WORKERS; w++ { | ||
wg.Add(1) | ||
go worker(jobs, dataDir, bar, db) | ||
} | ||
|
||
bar.Start() | ||
err = filepath.Walk(dataDir, func(path string, info os.FileInfo, err error) error { | ||
if err != nil { | ||
return err | ||
} | ||
if !info.IsDir() && filepath.Ext(path) == ".txt" { | ||
jobs <- path | ||
return nil | ||
} | ||
return nil | ||
|
||
}) | ||
|
||
close(jobs) | ||
wg.Wait() | ||
bar.FinishPrint("Finished processing.") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
module github.com/public-bible/chronam/chronam-import-text | ||
|
||
go 1.12 | ||
|
||
require ( | ||
github.com/fatih/color v1.7.0 // indirect | ||
github.com/lib/pq v1.1.1 | ||
github.com/mattn/go-colorable v0.1.2 // indirect | ||
github.com/mattn/go-runewidth v0.0.4 // indirect | ||
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed // indirect | ||
gopkg.in/cheggaaa/pb.v1 v1.0.28 | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
github.com/fatih/color v1.7.0 h1:DkWD4oS2D8LGGgTQ6IvwJJXSL5Vp2ffcQg58nFV38Ys= | ||
github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= | ||
github.com/lib/pq v1.1.1 h1:sJZmqHoEaY7f+NPP8pgLB/WxulyR3fewgCM2qaSlBb4= | ||
github.com/lib/pq v1.1.1/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= | ||
github.com/mattn/go-colorable v0.1.2 h1:/bC9yWikZXAL9uJdulbSfyVNIR3n3trXl+v8+1sx8mU= | ||
github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= | ||
github.com/mattn/go-isatty v0.0.8 h1:HLtExJ+uU2HOZ+wI0Tt5DtUDrx8yhUqDcp7fYERX4CE= | ||
github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= | ||
github.com/mattn/go-runewidth v0.0.4 h1:2BvfKmzob6Bmd4YsL0zygOqfdFnK7GR4QL06Do4/p7Y= | ||
github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= | ||
golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= | ||
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed h1:uPxWBzB3+mlnjy9W58qY1j/cjyFjutgw/Vhan2zLy/A= | ||
golang.org/x/sys v0.0.0-20190602015325-4c4f7f33c9ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= | ||
gopkg.in/cheggaaa/pb.v1 v1.0.28 h1:n1tBJnnK2r7g9OW2btFH91V92STTUevLXYFb8gy9EMk= | ||
gopkg.in/cheggaaa/pb.v1 v1.0.28/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
package main | ||
|
||
import ( | ||
"io/ioutil" | ||
"path/filepath" | ||
"regexp" | ||
"strings" | ||
) | ||
|
||
// getDocid converts the path of an OCR text file into its ChronAm document
// ID: the file's directory path relative to the base data directory, with a
// trailing slash. For example, path "/data/sn84026749/1865-04-15/ed-1/seq-1/ocr.txt"
// with base "/data" yields "sn84026749/1865-04-15/ed-1/seq-1/".
func getDocid(path string, base string) string {
	// TrimPrefix only strips the base directory when it is actually the
	// prefix; the original strings.Replace would also have clobbered a later
	// occurrence of the same substring inside the path.
	rel := strings.TrimPrefix(path, base+"/")
	return filepath.Dir(rel) + "/"
}
|
||
// pageNumberRe extracts the digits from a page directory name like "seq-1".
// Compiled once at package scope so getMetadata does not recompile it for
// every one of the millions of pages processed.
var pageNumberRe = regexp.MustCompile(`\d+`)

// getMetadata splits a docid like "sn84026749/1865-04-15/ed-1/seq-1/" into
// its LCCN, issue date, and page number (the digits of the fourth segment).
func getMetadata(docid string) (lccn, date, page string) {
	s := strings.Split(docid, "/")
	lccn = s[0]
	date = s[1]
	page = pageNumberRe.FindString(s[3])
	return
}
|
||
// getText reads the OCR file at path and returns its contents as a string.
// Any read error is returned unchanged alongside an empty string.
func getText(path string) (string, error) {
	contents, err := ioutil.ReadFile(path)
	if err != nil {
		return "", err
	}
	return string(contents), nil
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
package main | ||
|
||
import ( | ||
"strings" | ||
) | ||
|
||
// countWords returns the number of whitespace-separated tokens in s.
// strings.Fields treats any run of Unicode whitespace as one separator, so
// leading and trailing space contributes nothing to the count.
func countWords(s string) int {
	words := strings.Fields(s)
	return len(words)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
Version: 1.0 | ||
|
||
RestoreWorkspace: Default | ||
SaveWorkspace: Default | ||
AlwaysSaveHistory: Default | ||
|
||
EnableCodeIndexing: Yes | ||
UseSpacesForTab: Yes | ||
NumSpacesForTab: 2 | ||
Encoding: UTF-8 | ||
|
||
RnwWeave: Sweave | ||
LaTeX: pdfLaTeX | ||
|
||
AutoAppendNewline: Yes | ||
StripTrailingWhitespace: Yes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# Download the Chronam batches

library(tidyverse)
library(fs)
library(jsonlite)
library(odbc)

# Figure out what the maximum batch is.
MAX_BATCH <- 77
batches <- 1:MAX_BATCH

batch_urls <- str_glue("https://chroniclingamerica.loc.gov/batches/{batches}.json")

# Download each URL into dir, skipping files already present so the script
# can resume an interrupted run.
download_if_missing <- function(urls, dir) {
  walk(urls, function(url) {
    dest <- path(dir, basename(url))
    if (!file_exists(dest))
      download.file(url, dest)
  })
}

download_if_missing(batch_urls, "temp/batchjson")

batch_files <- dir_ls("temp/batchjson")
batch_file_ids <- tools::file_path_sans_ext(basename(batch_files))
stopifnot(all(batches %in% batch_file_ids))

# Read every batch JSON file, printing the index as a crude progress indicator
batch_json <- seq_along(batch_files) %>%
  map(function(i) { message(i); read_json(batch_files[i]) })

# Pull field n out of each element of list x
get_val <- function(x, n) { sapply(x, `[[`, n) }

# One row per batch: name, URL, page count, and ingest timestamp
parse_batches <- function(l) {
  batch <- l$batches %>% get_val("name")
  url <- l$batches %>% get_val("url")
  page_count <- l$batches %>% get_val("page_count")
  ingested <- l$batches %>% get_val("ingested")
  tibble(batch, url, page_count, ingested)
}

chronam_batches <- batch_json %>% map_df(parse_batches)

# Batch names must be unique
stopifnot(chronam_batches %>% count(batch) %>% filter(n > 1) %>% nrow() == 0)

# One row per (batch, lccn) pair for a single batch entry
parse_batch_for_lccns <- function(batch_l) {
  batch <- batch_l$name
  lccn <- batch_l$lccns %>% purrr::flatten_chr()
  stopifnot(length(batch) == 1)
  tibble(batch, lccn)
}

batch_to_lccn <- batch_json %>%
  map_df(function(x) map_df(x$batches, parse_batch_for_lccns))

# (batch, lccn) pairs must be unique
stopifnot(batch_to_lccn %>% count(batch, lccn) %>% filter(n > 1) %>% nrow() == 0)

lccns <- unique(batch_to_lccn$lccn)
lccn_urls <- str_glue("http://chroniclingamerica.loc.gov/lccn/{lccns}.json")

download_if_missing(lccn_urls, "temp/lccnjson")

lccn_files <- dir_ls("temp/lccnjson")
lccn_file_ids <- tools::file_path_sans_ext(basename(lccn_files))
stopifnot(all(lccns %in% lccn_file_ids))

lccn_json <- seq_along(lccn_files) %>%
  map(function(i) { message(i); read_json(lccn_files[i]) })

# One row per newspaper title, with issues and places kept as list-columns
parse_lccns <- function(l) {
  lccn <- l$lccn
  title <- l$name
  # publisher <- l$publisher
  place_of_pub <- l$place_of_publication
  url <- l$url
  # year_start <- l$start_year %>% as.integer()
  # year_end <- l$end_year %>% as.integer()
  issues_l <- l$issues %>% transpose()
  issues <- tibble(date = as.Date(issues_l$date_issued %>% simplify()),
                   url = issues_l$url %>% simplify()) %>% list()
  place <- l$place
  tibble(lccn, title, place_of_pub, url, issues, place)
}

lccn_data <- map_df(lccn_json, parse_lccns)

newspapers <- lccn_data %>% select(-issues, -place)
issues <- lccn_data %>% select(lccn, issues) %>% unnest()
# Places arrive as "state--county--city"; two-part values are "state--city",
# so when the third piece is missing, shift county into city and blank county.
places <- lccn_data %>% select(lccn, place) %>% unnest() %>%
  separate(place, into = c("state", "county", "city"), sep = "--", remove = FALSE) %>%
  mutate(dup = if_else(is.na(city), TRUE, FALSE),
         city = if_else(is.na(city), county, city),
         county = if_else(dup, NA_character_, county)) %>%
  select(-dup)

# NOTE(review): this DSN is "ResearchDB" while the export script connects to
# "Research DB" — confirm which name the odbc configuration actually defines.
con <- dbConnect(odbc::odbc(), "ResearchDB", timeout = 10)
dbWriteTable(con, "chronam_batch_to_lccn", batch_to_lccn)
dbWriteTable(con, "chronam_batches", chronam_batches)
dbWriteTable(con, "chronam_newspapers", newspapers)
dbWriteTable(con, "chronam_newspaper_places", places)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Create the download files for getting the OCR text

library(tidyverse)
library(jsonlite)
library(fs)
library(progress)

# Opened in append mode so re-running the script adds newly found pages
# rather than truncating the list.
OUT_FILE <- file("/media/data/chronam-wget/all-page-urls.txt", "a")

# Every issue-level JSON file mirrored from chroniclingamerica.loc.gov.
# NOTE(review): this definition was commented out in the original, which made
# the length(JSONISSUES) call below fail with an undefined-object error;
# restored so the script runs end to end.
JSONISSUES <- dir_ls("/media/data/chronam-wget/chroniclingamerica.loc.gov/lccn/", type = "file", glob = "*.json", recurse = TRUE)

pb <- progress_bar$new(total = length(JSONISSUES))

# Extract the OCR text URL for every page in one issue JSON file and append
# the URLs to the output file, ticking the progress bar as we go.
get_ocr_urls <- function(f) {
  json <- read_json(f)
  batch <- json$batch$name
  pages_tr <- json$pages %>% transpose()
  urls <- pages_tr$url %>%
    simplify() %>%
    str_replace(".json", "/ocr.txt") %>%
    str_replace("http", "https")
  cat(urls, file = OUT_FILE, sep = "\n")
  pb$tick()
}
pb$tick(0)
walk(JSONISSUES, get_ocr_urls)
close(OUT_FILE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Import as much of the OCR files as can be easily parsed in-memory in R

library(tidyverse)
library(jsonlite)
library(fs)
library(progress)

# Every issue-level JSON file mirrored from chroniclingamerica.loc.gov
JSONISSUES <- dir_ls("/media/data/chronam-wget/chroniclingamerica.loc.gov/lccn/", type = "file", glob = "*.json", recurse = TRUE)

pb <- progress_bar$new(total = length(JSONISSUES))

# NOTE(review): despite its name, this function's body is identical to
# get_ocr_urls in the URL-listing script: it writes OCR URLs to OUT_FILE,
# which is never defined in this script, so calling it would fail. It is
# also never invoked — there is no walk() call at the end. This looks like
# an unfinished copy-paste; confirm the intended implementation before use.
get_page_data <- function(f) {
  json <- read_json(f)
  batch <- json$batch$name
  pages_tr <- json$pages %>% transpose()
  urls <- pages_tr$url %>%
    simplify() %>%
    str_replace(".json", "/ocr.txt") %>%
    str_replace("http", "https")
  cat(urls, file = OUT_FILE, sep = "\n")
  pb$tick()
}
pb$tick(0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
module github.com/public-bible/chronam/ocr-quality | ||
|
||
go 1.12 | ||
|
||
require ( | ||
github.com/euskadi31/go-tokenizer v1.0.0 | ||
github.com/fatih/color v1.7.0 // indirect | ||
github.com/lib/pq v1.1.1 | ||
github.com/mattn/go-colorable v0.1.2 // indirect | ||
github.com/mattn/go-runewidth v0.0.4 // indirect | ||
github.com/stretchr/testify v1.3.0 // indirect | ||
golang.org/x/sys v0.0.0-20190610200419-93c9922d18ae // indirect | ||
gopkg.in/cheggaaa/pb.v1 v1.0.28 | ||
) |
Oops, something went wrong.