From f3d2f0cc5943312f8d6c8bec50d0daf70f4116e5 Mon Sep 17 00:00:00 2001
From: Matthew Esposito
Date: Mon, 21 Oct 2024 20:54:05 -0400
Subject: [PATCH] feat(scraper): add scraper CLI

---
 Cargo.lock          |  19 ++++++++
 Cargo.toml          |  10 ++++
 src/scraper/main.rs | 112 ++++++++++++++++++++++++++++++++++++++++++++
 src/utils.rs        |  15 ++++--
 4 files changed, 153 insertions(+), 3 deletions(-)
 create mode 100644 src/scraper/main.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8a3a6f18..63271fb4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -268,6 +268,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f"
 dependencies = [
  "clap_builder",
+ "clap_derive",
 ]
 
 [[package]]
@@ -280,6 +281,18 @@ dependencies = [
  "clap_lex",
 ]
 
+[[package]]
+name = "clap_derive"
+version = "4.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.68",
+]
+
 [[package]]
 name = "clap_lex"
 version = "0.7.1"
@@ -735,6 +748,12 @@ version = "0.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
 
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
 [[package]]
 name = "hermit-abi"
 version = "0.3.9"
diff --git a/Cargo.toml b/Cargo.toml
index cac6f7e7..4bba8487 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ authors = [
     "spikecodes <19519553+spikecodes@users.noreply.github.com>",
 ]
 edition = "2021"
+default-run = "redlib"
 
 [dependencies]
 rinja = { version = "0.3.4", default-features = false }
@@ -16,6 +17,7 @@ cached = { version = "0.51.3", features = ["async"] }
 clap = { version = "4.4.11", default-features = false, features = [
     "std",
     "env",
+    "derive",
 ] }
 regex = "1.10.2"
 serde = { version = "1.0.193", features = ["derive"] }
@@ -56,3 +58,11 @@ sealed_test = "1.0.0"
 codegen-units = 1
 lto = true
 strip = "symbols"
+
+[[bin]]
+name = "redlib"
+path = "src/main.rs"
+
+[[bin]]
+name = "scraper"
+path = "src/scraper/main.rs"
\ No newline at end of file
diff --git a/src/scraper/main.rs b/src/scraper/main.rs
new file mode 100644
index 00000000..bf7ee763
--- /dev/null
+++ b/src/scraper/main.rs
@@ -0,0 +1,112 @@
+//! Command-line scraper: downloads posts from a subreddit listing and saves
+//! them to `<sub>.json` in the current directory.
+
+use std::{fmt::Display, io::Write};
+
+use clap::{Parser, ValueEnum};
+use redlib::utils::Post;
+
+/// Command-line arguments accepted by the scraper.
+#[derive(Parser)]
+#[command(name = "scraper")]
+#[command(about = "Download posts from a subreddit listing into a file", long_about = None)]
+struct Cli {
+    /// Subreddit to scrape, without the `/r/` prefix.
+    #[arg(short = 's', long = "sub")]
+    sub: String,
+
+    /// Maximum number of posts to save.
+    #[arg(short = 'c', long = "count")]
+    count: usize,
+
+    /// Listing to read the posts from.
+    #[arg(long = "sort")]
+    sort: SortOrder,
+
+    /// Output file format.
+    #[arg(short = 'f', long = "format", value_enum)]
+    format: Format,
+}
+
+/// Listing orderings that can be requested.
+#[derive(Debug, Clone, ValueEnum)]
+enum SortOrder {
+    Hot,
+    Rising,
+    New,
+    Top,
+    Controversial,
+}
+
+impl Display for SortOrder {
+    /// Writes the lowercase name that is interpolated into the request path.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match self {
+            SortOrder::Hot => "hot",
+            SortOrder::Rising => "rising",
+            SortOrder::New => "new",
+            SortOrder::Top => "top",
+            SortOrder::Controversial => "controversial",
+        })
+    }
+}
+
+/// Supported output formats.
+#[derive(Debug, Clone, ValueEnum)]
+enum Format {
+    Json,
+}
+
+/// Reports `context` and `err` on stderr, then exits with status 1.
+fn fail(context: &str, err: impl std::fmt::Debug) -> ! {
+    eprintln!("error: {context}: {err:?}");
+    std::process::exit(1)
+}
+
+/// Fetches listing pages until `count` posts are collected or the listing is
+/// exhausted, then writes at most `count` posts to `<sub>.json`.
+#[tokio::main]
+async fn main() {
+    let Cli { sub, count, sort, format } = Cli::parse();
+
+    let initial = format!("/r/{sub}/{sort}.json?&raw_json=1");
+    let (mut posts, mut after) = Post::fetch(&initial, false)
+        .await
+        .unwrap_or_else(|err| fail("failed to fetch the first page", err));
+
+    // NOTE(review): assumes `Post::fetch` returns an empty cursor once the
+    // listing has no further page -- confirm against its implementation.
+    while posts.len() < count && !after.is_empty() {
+        let path = format!("/r/{sub}/{sort}.json?sort={sort}&t=&after={after}&raw_json=1");
+        let (new_posts, new_after) = Post::fetch(&path, false)
+            .await
+            .unwrap_or_else(|err| fail("failed to fetch the next page", err));
+        // A page that adds nothing would otherwise keep the loop spinning forever.
+        if new_posts.is_empty() {
+            break;
+        }
+        posts.extend(new_posts);
+        after = new_after;
+
+        // Redraw the progress counter in place on the current terminal line.
+        print!("\rFetched {} posts", posts.len());
+        std::io::stdout()
+            .flush()
+            .unwrap_or_else(|err| fail("failed to flush stdout", err));
+    }
+
+    // Whole pages are appended, so the total can overshoot the requested amount.
+    posts.truncate(count);
+
+    match format {
+        Format::Json => {
+            let filename = format!("{sub}.json");
+            let json = serde_json::to_string(&posts)
+                .unwrap_or_else(|err| fail("failed to serialize posts", err));
+            std::fs::write(&filename, json)
+                .unwrap_or_else(|err| fail("failed to write the output file", err));
+            // `\r` overwrites the progress counter left on the line, if any.
+            println!("\rSaved {} posts to {filename}", posts.len());
+        }
+    }
+}
diff --git a/src/utils.rs b/src/utils.rs
index 6f977754..abee2689 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -11,6 +11,7 @@ use once_cell::sync::Lazy;
 use regex::Regex;
 use rinja::Template;
 use rust_embed::RustEmbed;
+use serde::Serialize;
 use serde_json::Value;
 use serde_json_path::{JsonPath, JsonPathExt};
 use std::collections::{HashMap, HashSet};
@@ -46,6 +47,7 @@ pub enum ResourceType {
 }
 
 // Post flair with content, background color and foreground color
+#[derive(Serialize)]
 pub struct Flair {
     pub flair_parts: Vec<FlairPart>,
     pub text: String,
@@ -54,7 +56,7 @@ pub struct Flair {
 }
 
 // Part of flair, either emoji or text
-#[derive(Clone)]
+#[derive(Clone, Serialize)]
 pub struct FlairPart {
     pub flair_part_type: String,
     pub value: String,
@@ -96,12 +98,14 @@ impl FlairPart {
     }
 }
 
+#[derive(Serialize)]
 pub struct Author {
     pub name: String,
     pub flair: Flair,
     pub distinguished: String,
 }
 
+#[derive(Serialize)]
 pub struct Poll {
     pub poll_options: Vec<PollOption>,
     pub voting_end_timestamp: (String, String),
@@ -129,6 +133,7 @@ impl Poll {
     }
 }
 
+#[derive(Serialize)]
 pub struct PollOption {
     pub id: u64,
     pub text: String,
@@ -158,13 +163,14 @@ impl PollOption {
 }
 
 // Post flags with nsfw and stickied
+#[derive(Serialize)]
 pub struct Flags {
     pub spoiler: bool,
     pub nsfw: bool,
     pub stickied: bool,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Serialize)]
 pub struct Media {
     pub url: String,
     pub alt_url: String,
@@ -264,6 +270,7 @@ impl Media {
     }
 }
 
+#[derive(Serialize)]
 pub struct GalleryMedia {
     pub url: String,
     pub width: i64,
@@ -304,6 +311,7 @@ impl GalleryMedia {
 }
 
 // Post containing content, metadata and media
+#[derive(Serialize)]
 pub struct Post {
     pub id: String,
     pub title: String,
@@ -470,7 +478,7 @@ pub struct Comment {
     pub prefs: Preferences,
 }
 
-#[derive(Default, Clone)]
+#[derive(Default, Clone, Serialize)]
 pub struct Award {
     pub name: String,
     pub icon_url: String,
@@ -484,6 +492,7 @@ impl std::fmt::Display for Award {
     }
 }
 
+#[derive(Serialize)]
 pub struct Awards(pub Vec<Award>);
 
 impl std::ops::Deref for Awards {