From 6c64ebd56b98f5616c2014e2e0567fa37791844c Mon Sep 17 00:00:00 2001 From: Matthew Esposito Date: Fri, 15 Nov 2024 16:53:00 -0500 Subject: [PATCH] fix(scraper): additionally grab common words --- Cargo.lock | 46 ++++++++++++++++++++++++++ Cargo.toml | 1 + src/scraper/main.rs | 79 ++++++++++++++++++++++++++++++++++++++------- 3 files changed, 114 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4a1a057..6447c2ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -71,6 +71,12 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56" +[[package]] +name = "anyhow" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" + [[package]] name = "arc-swap" version = "1.7.1" @@ -307,6 +313,18 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +[[package]] +name = "common-words-all" +version = "0.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84a6ff47eb813c9e315610ceca0ddd247827e22f2cdadc4189e4676a81470c77" +dependencies = [ + "anyhow", + "csv", + "glob", + "serde", +] + [[package]] name = "cookie" version = "0.18.1" @@ -370,6 +388,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "darling" version = "0.20.10" @@ 
-642,6 +681,12 @@ version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "globset" version = "0.4.15" @@ -1160,6 +1205,7 @@ dependencies = [ "build_html", "cached", "clap", + "common-words-all", "cookie", "dotenvy", "fastrand", diff --git a/Cargo.toml b/Cargo.toml index 2fe14563..dac3f483 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ rss = "2.0.7" arc-swap = "1.7.1" serde_json_path = "0.6.7" async-recursion = "1.1.1" +common-words-all = { version = "0.0.2", default-features = false, features = ["english", "one"] } [dev-dependencies] diff --git a/src/scraper/main.rs b/src/scraper/main.rs index bc154ffa..f2e48d63 100644 --- a/src/scraper/main.rs +++ b/src/scraper/main.rs @@ -1,6 +1,7 @@ -use std::{fmt::Display, io::Write}; +use std::{collections::HashMap, fmt::Display, io::Write}; use clap::{Parser, ValueEnum}; +use common_words_all::{get_top, Language, NgramSize}; use redlib::utils::Post; #[derive(Parser)] @@ -10,9 +11,6 @@ struct Cli { #[arg(short = 's', long = "sub")] sub: String, - #[arg(short = 'c', long = "count")] - count: usize, - #[arg(long = "sort")] sort: SortOrder, @@ -50,28 +48,85 @@ enum Format { #[tokio::main] async fn main() { + pretty_env_logger::init(); let cli = Cli::parse(); - let (sub, final_count, sort, format, output) = (cli.sub, cli.count, cli.sort, cli.format, cli.output); + let (sub, sort, format, output) = (cli.sub, cli.sort, cli.format, cli.output); let initial = format!("/r/{sub}/{sort}.json?&raw_json=1"); - let (mut posts, mut after) = Post::fetch(&initial, false).await.unwrap(); - while posts.len() < final_count { + let (posts, mut after) = Post::fetch(&initial, false).await.unwrap(); + let mut hashmap = 
HashMap::new(); + hashmap.extend(posts.into_iter().map(|post| (post.id.clone(), post))); + loop { print!("\r"); let path = format!("/r/{sub}/{sort}.json?sort={sort}&t=&after={after}&raw_json=1"); let (new_posts, new_after) = Post::fetch(&path, false).await.unwrap(); - posts.extend(new_posts); + let old_len = hashmap.len(); + // convert to hashmap and extend hashmap + let new_posts = new_posts.into_iter().map(|post| (post.id.clone(), post)).collect::<HashMap<_, _>>(); + let len = new_posts.len(); + hashmap.extend(new_posts); + if hashmap.len() - old_len < 3 { + break; + } + + let x = hashmap.len() - old_len; after = new_after; // Print number of posts fetched - print!("Fetched {} posts", posts.len()); + print!("Fetched {len} posts (+{x})",); std::io::stdout().flush().unwrap(); } + println!("\n\n"); + // additionally search if final count not reached + + for word in get_top(Language::English, 10_000, NgramSize::One) { + let mut retrieved_posts_from_search = 0; + let initial = format!("/r/{sub}/search.json?q={word}&restrict_sr=on&include_over_18=on&raw_json=1&sort={sort}"); + println!("Grabbing posts with word {word}."); + let (posts, mut after) = Post::fetch(&initial, false).await.unwrap(); + hashmap.extend(posts.into_iter().map(|post| (post.id.clone(), post))); + 'search: loop { + let path = format!("/r/{sub}/search.json?q={word}&restrict_sr=on&include_over_18=on&raw_json=1&sort={sort}&after={after}"); + let (new_posts, new_after) = Post::fetch(&path, false).await.unwrap(); + if new_posts.is_empty() || new_after.is_empty() { + println!("No more posts for word {word}"); + break 'search; + } + retrieved_posts_from_search += new_posts.len(); + let old_len = hashmap.len(); + let new_posts = new_posts.into_iter().map(|post| (post.id.clone(), post)).collect::<HashMap<_, _>>(); + let len = new_posts.len(); + hashmap.extend(new_posts); + let delta = hashmap.len() - old_len; + after = new_after; + // Print number of posts fetched + println!("Fetched {len} posts (+{delta})",); + + if 
retrieved_posts_from_search > 1000 { + println!("Reached 1000 posts from search"); + break 'search; + } + } + // Need to save incrementally. atomic save + move + let tmp_file = output.clone().unwrap_or_else(|| format!("{sub}.json.tmp")); + let perm_file = output.clone().unwrap_or_else(|| format!("{sub}.json")); + write_posts(&hashmap.values().collect(), tmp_file.clone()); + // move file + std::fs::rename(tmp_file, perm_file).unwrap(); + } + + println!("\n\n"); - posts.truncate(final_count); + println!("Size of hashmap: {}", hashmap.len()); + let posts: Vec<&Post> = hashmap.values().collect(); match format { Format::Json => { let filename: String = output.unwrap_or_else(|| format!("{sub}.json")); - let json = serde_json::to_string(&posts).unwrap(); - std::fs::write(filename, json).unwrap(); + write_posts(&posts, filename); } } } + +fn write_posts(posts: &Vec<&Post>, filename: String) { + let json = serde_json::to_string(&posts).unwrap(); + std::fs::write(filename, json).unwrap(); +}