diff --git a/src/server.tsx b/src/server.tsx index 80e0185..6816f70 100644 --- a/src/server.tsx +++ b/src/server.tsx @@ -2,9 +2,11 @@ import { Hono } from "hono"; import { serveStatic } from "hono/bun"; import { logger } from "hono/logger"; import Layout from "./components/Layout.tsx"; -import { fetchAllArticles } from "./util/api"; -import db from "./util/db"; -import { Article } from "./types.ts"; +import { + fetchAndStoreArticles, + getCachedArticles, + isCacheValid, +} from "./util/api"; const app = new Hono(); @@ -12,7 +14,9 @@ app.use("/styles/*", serveStatic({ root: "./public/" })); app.use("*", logger()); app.get("/", async (c) => { - await fetchAllArticles(); + if (!isCacheValid()) { + await fetchAndStoreArticles(); + } return c.html( @@ -74,9 +78,11 @@ app.get("/articles", async (c) => { const articlesPerPage = 5; const offset = (page - 1) * articlesPerPage; - const articles = db - .prepare("SELECT * FROM articles ORDER BY RANDOM() DESC LIMIT ? OFFSET ?") - .all(articlesPerPage, offset) as Article[]; + if (!isCacheValid()) { + await fetchAndStoreArticles(); + } + + const articles = getCachedArticles(offset, articlesPerPage); const nextPage = page + 1; diff --git a/src/util/api.ts b/src/util/api.ts index 139d9d3..b605f36 100644 --- a/src/util/api.ts +++ b/src/util/api.ts @@ -1,75 +1,13 @@ import { load } from "cheerio"; -import { z } from "zod"; import db from "./db"; import { Article } from "../types"; import shuffle from "./shuffle"; - -const articleSchema = z.object({ - title: z - .string() - .refine((title) => title.split(" ").length >= 5, { - message: "Title must contain at least 5 words", - }) - .refine( - (title) => - !["Video Duration", "play", "play-inverse"].some((prefix) => - title.startsWith(prefix), - ), - ), - link: z.string().url(), - source: z.string(), -}); - -type NewsSource = { - name: string; - url: string; - listSelector: string; - baseUrl?: string; -}; - -const newsSources: NewsSource[] = [ - { - name: "NPR", - url: `http://text.npr.org`, - listSelector: "ul > li > a", - baseUrl: "http://text.npr.org", - }, - { - name: "Al Jazeera", - url: `https://www.aljazeera.com/us-canada`, - listSelector: "article .gc__content a", - baseUrl: "https://www.aljazeera.com", - }, -]; - -const isValidArticle = (article: { title: string; link: string }) => { - try { - articleSchema.parse(article); - return true; - } catch (e) { - return false; - } -}; - -const clearCacheIfNeeded = () => { - const oldestArticle = db - .prepare("SELECT created_at FROM articles ORDER BY created_at ASC LIMIT 1") - .get() as { created_at: string } | undefined; - - if (oldestArticle) { - const articleDate = new Date(oldestArticle.created_at); - const now = new Date(); - const hoursDifference = - (now.getTime() - articleDate.getTime()) / (1000 * 60 * 60); - - if (hoursDifference >= 8) { - if (process.env["DEBUG"] === "true") { - console.log("*** CLEARING CACHE"); - } - db.prepare("DELETE FROM articles").run(); - } - } -}; +import { newsSources, NewsSource } from "./newsSources"; +import { + isValidArticle, + insertArticle, + clearCacheIfNeeded, +} from "./articleUtils"; const fetchArticlesFromSource = async ( source: NewsSource, @@ -77,33 +15,25 @@ const fetchArticlesFromSource = async ( ): Promise => { clearCache(); - const cachedArticles = db - .prepare("SELECT * FROM articles WHERE source = ?") - .all(source.name) as Article[]; - - if (cachedArticles.length > 0) { - if (process.env["DEBUG"] === "true") { - console.log(`*** CACHE HIT: ${source.name}`); - } - return cachedArticles; + if (process.env["DEBUG"] === "true") { + console.log(`*** Fetching articles from: ${source.name}`); } const response = await fetch(source.url); const text = await response.text(); if (process.env["DEBUG"] === "true") { - console.log(`*** CACHE MISS: ${source.name}`); + console.log(`*** FETCHING: ${source.name}`); } const $ = load(text); const articles: Article[] = []; $(source.listSelector).each((_, element) => { const title = $(element).text().trim(); - const link = source.baseUrl - ? `${source.baseUrl}${$(element).attr("href")}` - : $(element).attr("href"); + const relativeLink = $(element).attr("href"); - if (title && link) { + if (title && relativeLink) { + const link = new URL(relativeLink, source.baseUrl).href; const article: Article = { id: title, title, @@ -117,55 +47,116 @@ const fetchArticlesFromSource = async ( } } else { articles.push(article); + if (process.env["DEBUG"] === "true") { + console.log(`*** VALID: ${source.name}: ${title} ${link}`); + } + } + } else { + if (process.env["DEBUG"] === "true") { + console.log( + `*** MISSING INFO: ${source.name}: ${title} ${relativeLink}`, + ); } } }); + if (process.env["DEBUG"] === "true") { + console.log(`*** Fetched ${articles.length} articles from: ${source.name}`); + } + return articles; }; -const fetchAllArticles = async () => { +// Fetch articles from all sources +const fetchAllArticles = async (): Promise => { const allArticles: Article[] = []; for (const source of newsSources) { + if (process.env["DEBUG"] === "true") { + console.log(`*** Fetching articles from all sources`); + } const fetchedArticles = await fetchArticlesFromSource(source); allArticles.push(...fetchedArticles); } shuffle(allArticles); - const insert = db.prepare( - "INSERT INTO articles (id, title, link, source, created_at) VALUES (?, ?, ?, ?, ?)", - ); - - allArticles.forEach((article) => { - try { - insert.run( - article.id, - article.title, - article.link, - article.source, - article.created_at, - ); - } catch (error) { - if (process.env["DEBUG"] === "true") { - console.log(`*** DUPLICATE: ${article.title}`); - } - } - }); + if (process.env["DEBUG"] === "true") { + console.log(`*** Total articles fetched: ${allArticles.length}`); + } + + return allArticles; +}; + +const insertArticles = (articles: Article[]) => { + if (process.env["DEBUG"] === "true") { + console.log(`*** Inserting ${articles.length} articles into the database`); + } + articles.forEach(insertArticle); }; const getCachedArticles = (offset: number, limit: number): Article[] => { - return db - .prepare("SELECT * FROM articles ORDER BY created_at DESC LIMIT ? OFFSET ?") + if (process.env["DEBUG"] === "true") { + console.log( + `*** Getting cached articles with offset: ${offset}, limit: ${limit}`, + ); + } + const articles = db + .prepare("SELECT * FROM articles ORDER BY RANDOM() DESC LIMIT ? OFFSET ?") .all(limit, offset) as Article[]; + + if (process.env["DEBUG"] === "true") { + console.log(`*** Retrieved ${articles.length} cached articles`); + } + + return articles; +}; + +const fetchAndStoreArticles = async () => { + if (process.env["DEBUG"] === "true") { + console.log(`*** Fetching and storing articles`); + } + const allArticles = await fetchAllArticles(); + insertArticles(allArticles); + if (process.env["DEBUG"] === "true") { + console.log(`*** Articles fetched and stored successfully`); + } +}; + +const isCacheValid = (): boolean => { + const oldestArticle = db + .prepare("SELECT created_at FROM articles ORDER BY created_at ASC LIMIT 1") + .get() as { created_at: string } | undefined; + + if (oldestArticle) { + const articleDate = new Date(oldestArticle.created_at); + const now = new Date(); + const hoursDifference = + (now.getTime() - articleDate.getTime()) / (1000 * 60 * 60); + + if (process.env["DEBUG"] === "true") { + console.log( + `*** Cache validity checked. Hours difference: ${hoursDifference}`, + ); + } + + return hoursDifference < 8; + } + + if (process.env["DEBUG"] === "true") { + console.log(`*** No articles in cache`); + } + + return false; }; export { fetchArticlesFromSource, fetchAllArticles, + insertArticles, getCachedArticles, - isValidArticle, + fetchAndStoreArticles, + isCacheValid, newsSources, clearCacheIfNeeded, }; diff --git a/src/util/articleSchema.ts b/src/util/articleSchema.ts new file mode 100644 index 0000000..110b61b --- /dev/null +++ b/src/util/articleSchema.ts @@ -0,0 +1,22 @@ +import { z } from "zod"; + +const articleSchema = z.object({ + title: z + .string() + .refine((title) => title.split(" ").length >= 3, { + message: "Title must contain at least 3 words", + }) + .refine( + (title) => + !["Video Duration", "play", "play-inverse"].some((prefix) => + title.startsWith(prefix), + ), + { + message: "Title starts with an invalid prefix", + }, + ), + link: z.string().url(), + source: z.string(), +}); + +export default articleSchema; diff --git a/src/util/articleUtils.ts b/src/util/articleUtils.ts new file mode 100644 index 0000000..8af7857 --- /dev/null +++ b/src/util/articleUtils.ts @@ -0,0 +1,70 @@ +import db from "./db"; +import { Article } from "../types"; +import articleSchema from "./articleSchema"; + +const isValidArticle = (article: Article) => { + try { + articleSchema.parse(article); + return true; + } catch (e) { + if (process.env["DEBUG"] === "true") { + console.log( + `*** INVALID: ${article.source}: ${article.title} - ${e.errors.map((err: any) => err.message).join(", ")}`, + ); + } + return false; + } +}; + +const insertArticle = (article: Article) => { + const insert = db.prepare( + "INSERT INTO articles (id, title, link, source, created_at) VALUES (?, ?, ?, ?, ?)", + ); + + const checkExistence = db.prepare( + "SELECT COUNT(*) as count FROM articles WHERE title = ?", + ); + + const result = checkExistence.get(article.title) as { count: number }; + if (result.count === 0) { + try { + insert.run( + article.id, + article.title, + article.link, + article.source, + article.created_at, + ); + } catch (error) { + if (process.env["DEBUG"] === "true") { + console.log(`*** ERROR: ${error.message}`); + } + } + } else { + if (process.env["DEBUG"] === "true") { + console.log(`*** DUPLICATE: ${article.title}`); + } + } +}; + +const clearCacheIfNeeded = () => { + const oldestArticle = db + .prepare("SELECT created_at FROM articles ORDER BY created_at ASC LIMIT 1") + .get() as { created_at: string } | undefined; + + if (oldestArticle) { + const articleDate = new Date(oldestArticle.created_at); + const now = new Date(); + const hoursDifference = + (now.getTime() - articleDate.getTime()) / (1000 * 60 * 60); + + if (hoursDifference >= 8) { + if (process.env["DEBUG"] === "true") { + console.log("*** CLEARING CACHE"); + } + db.prepare("DELETE FROM articles").run(); + } + } +}; + +export { isValidArticle, insertArticle, clearCacheIfNeeded }; diff --git a/src/util/newsSources.ts b/src/util/newsSources.ts new file mode 100644 index 0000000..8b08677 --- /dev/null +++ b/src/util/newsSources.ts @@ -0,0 +1,23 @@ +type NewsSource = { + name: string; + url: string; + listSelector: string; + baseUrl?: string; +}; + +const newsSources: NewsSource[] = [ + { + name: "NPR", + url: `http://text.npr.org`, + listSelector: "ul > li > a", + baseUrl: "https://text.npr.org", + }, + { + name: "Al Jazeera", + url: `https://www.aljazeera.com/us-canada`, + listSelector: "article .gc__content a", + baseUrl: "https://www.aljazeera.com", + }, +]; + +export { newsSources, NewsSource };