Skip to content

Commit

Permalink
Refactor for better clarity and fix several data layer bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
tireymorris committed Jun 22, 2024
1 parent b125241 commit 2b83051
Show file tree
Hide file tree
Showing 5 changed files with 224 additions and 112 deletions.
20 changes: 13 additions & 7 deletions src/server.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@ import { Hono } from "hono";
import { serveStatic } from "hono/bun";
import { logger } from "hono/logger";
import Layout from "./components/Layout.tsx";
import { fetchAllArticles } from "./util/api";
import db from "./util/db";
import { Article } from "./types.ts";
import {
fetchAndStoreArticles,
getCachedArticles,
isCacheValid,
} from "./util/api";

const app = new Hono();

app.use("/styles/*", serveStatic({ root: "./public/" }));
app.use("*", logger());

app.get("/", async (c) => {
await fetchAllArticles();
if (!isCacheValid()) {
await fetchAndStoreArticles();
}

return c.html(
<Layout title="hyperwave">
Expand Down Expand Up @@ -74,9 +78,11 @@ app.get("/articles", async (c) => {
const articlesPerPage = 5;
const offset = (page - 1) * articlesPerPage;

const articles = db
.prepare("SELECT * FROM articles ORDER BY RANDOM() DESC LIMIT ? OFFSET ?")
.all(articlesPerPage, offset) as Article[];
if (!isCacheValid()) {
await fetchAndStoreArticles();
}

const articles = getCachedArticles(offset, articlesPerPage);

const nextPage = page + 1;

Expand Down
201 changes: 96 additions & 105 deletions src/util/api.ts
Original file line number Diff line number Diff line change
@@ -1,109 +1,39 @@
import { load } from "cheerio";
import { z } from "zod";
import db from "./db";
import { Article } from "../types";
import shuffle from "./shuffle";

const articleSchema = z.object({
title: z
.string()
.refine((title) => title.split(" ").length >= 5, {
message: "Title must contain at least 5 words",
})
.refine(
(title) =>
!["Video Duration", "play", "play-inverse"].some((prefix) =>
title.startsWith(prefix),
),
),
link: z.string().url(),
source: z.string(),
});

type NewsSource = {
name: string;
url: string;
listSelector: string;
baseUrl?: string;
};

const newsSources: NewsSource[] = [
{
name: "NPR",
url: `http://text.npr.org`,
listSelector: "ul > li > a",
baseUrl: "http://text.npr.org",
},
{
name: "Al Jazeera",
url: `https://www.aljazeera.com/us-canada`,
listSelector: "article .gc__content a",
baseUrl: "https://www.aljazeera.com",
},
];

const isValidArticle = (article: { title: string; link: string }) => {
try {
articleSchema.parse(article);
return true;
} catch (e) {
return false;
}
};

const clearCacheIfNeeded = () => {
const oldestArticle = db
.prepare("SELECT created_at FROM articles ORDER BY created_at ASC LIMIT 1")
.get() as { created_at: string } | undefined;

if (oldestArticle) {
const articleDate = new Date(oldestArticle.created_at);
const now = new Date();
const hoursDifference =
(now.getTime() - articleDate.getTime()) / (1000 * 60 * 60);

if (hoursDifference >= 8) {
if (process.env["DEBUG"] === "true") {
console.log("*** CLEARING CACHE");
}
db.prepare("DELETE FROM articles").run();
}
}
};
import { newsSources, NewsSource } from "./newsSources";
import {
isValidArticle,
insertArticle,
clearCacheIfNeeded,
} from "./articleUtils";

const fetchArticlesFromSource = async (
source: NewsSource,
clearCache: () => void = clearCacheIfNeeded,
): Promise<Article[]> => {
clearCache();

const cachedArticles = db
.prepare("SELECT * FROM articles WHERE source = ?")
.all(source.name) as Article[];

if (cachedArticles.length > 0) {
if (process.env["DEBUG"] === "true") {
console.log(`*** CACHE HIT: ${source.name}`);
}
return cachedArticles;
if (process.env["DEBUG"] === "true") {
console.log(`*** Fetching articles from: ${source.name}`);
}

const response = await fetch(source.url);
const text = await response.text();

if (process.env["DEBUG"] === "true") {
console.log(`*** CACHE MISS: ${source.name}`);
console.log(`*** FETCHING: ${source.name}`);
}
const $ = load(text);
const articles: Article[] = [];

$(source.listSelector).each((_, element) => {
const title = $(element).text().trim();
const link = source.baseUrl
? `${source.baseUrl}${$(element).attr("href")}`
: $(element).attr("href");
const relativeLink = $(element).attr("href");

if (title && link) {
if (title && relativeLink) {
const link = new URL(relativeLink, source.baseUrl).href;
const article: Article = {
id: title,
title,
Expand All @@ -117,55 +47,116 @@ const fetchArticlesFromSource = async (
}
} else {
articles.push(article);
if (process.env["DEBUG"] === "true") {
console.log(`*** VALID: ${source.name}: ${title} ${link}`);
}
}
} else {
if (process.env["DEBUG"] === "true") {
console.log(
`*** MISSING INFO: ${source.name}: ${title} ${relativeLink}`,
);
}
}
});

if (process.env["DEBUG"] === "true") {
console.log(`*** Fetched ${articles.length} articles from: ${source.name}`);
}

return articles;
};

const fetchAllArticles = async () => {
// Fetch articles from all sources
const fetchAllArticles = async (): Promise<Article[]> => {
const allArticles: Article[] = [];

for (const source of newsSources) {
if (process.env["DEBUG"] === "true") {
console.log(`*** Fetching articles from all sources`);
}
const fetchedArticles = await fetchArticlesFromSource(source);
allArticles.push(...fetchedArticles);
}

shuffle(allArticles);

const insert = db.prepare(
"INSERT INTO articles (id, title, link, source, created_at) VALUES (?, ?, ?, ?, ?)",
);

allArticles.forEach((article) => {
try {
insert.run(
article.id,
article.title,
article.link,
article.source,
article.created_at,
);
} catch (error) {
if (process.env["DEBUG"] === "true") {
console.log(`*** DUPLICATE: ${article.title}`);
}
}
});
if (process.env["DEBUG"] === "true") {
console.log(`*** Total articles fetched: ${allArticles.length}`);
}

return allArticles;
};

const insertArticles = (articles: Article[]) => {
if (process.env["DEBUG"] === "true") {
console.log(`*** Inserting ${articles.length} articles into the database`);
}
articles.forEach(insertArticle);
};

// Read one page of cached articles from SQLite, in random order.
//
// NOTE(review): ORDER BY RANDOM() re-randomizes on every query, so
// LIMIT/OFFSET pagination can repeat or skip articles across pages —
// consider a stable ordering (e.g. created_at) if that matters.
//
// @param offset - number of rows to skip
// @param limit  - maximum number of rows to return
const getCachedArticles = (offset: number, limit: number): Article[] => {
  if (process.env["DEBUG"] === "true") {
    console.log(
      `*** Getting cached articles with offset: ${offset}, limit: ${limit}`,
    );
  }

  // "ORDER BY RANDOM() DESC" in the original was a no-op: sorting a random
  // key descending is still a random order, so the DESC has been dropped.
  const articles = db
    .prepare("SELECT * FROM articles ORDER BY RANDOM() LIMIT ? OFFSET ?")
    .all(limit, offset) as Article[];

  if (process.env["DEBUG"] === "true") {
    console.log(`*** Retrieved ${articles.length} cached articles`);
  }

  return articles;
};

const fetchAndStoreArticles = async () => {
if (process.env["DEBUG"] === "true") {
console.log(`*** Fetching and storing articles`);
}
const allArticles = await fetchAllArticles();
insertArticles(allArticles);
if (process.env["DEBUG"] === "true") {
console.log(`*** Articles fetched and stored successfully`);
}
};

// Report whether the article cache is still fresh.
//
// Freshness is judged by the OLDEST row in `articles`: if it was created
// less than `maxAgeHours` ago the whole cache counts as valid. An empty
// table is always invalid, so callers will trigger a fresh fetch.
//
// @param maxAgeHours - cache lifetime in hours; defaults to 8, the value
//                      previously hard-coded here.
// @returns true when the cache can be served without refetching
const isCacheValid = (maxAgeHours: number = 8): boolean => {
  const oldestArticle = db
    .prepare("SELECT created_at FROM articles ORDER BY created_at ASC LIMIT 1")
    .get() as { created_at: string } | undefined;

  // Guard clause: no rows means nothing to serve.
  if (!oldestArticle) {
    if (process.env["DEBUG"] === "true") {
      console.log(`*** No articles in cache`);
    }
    return false;
  }

  const ageMs = Date.now() - new Date(oldestArticle.created_at).getTime();
  const hoursDifference = ageMs / (1000 * 60 * 60);

  if (process.env["DEBUG"] === "true") {
    console.log(
      `*** Cache validity checked. Hours difference: ${hoursDifference}`,
    );
  }

  return hoursDifference < maxAgeHours;
};

export {
fetchArticlesFromSource,
fetchAllArticles,
insertArticles,
getCachedArticles,
isValidArticle,
fetchAndStoreArticles,
isCacheValid,
newsSources,
clearCacheIfNeeded,
};
22 changes: 22 additions & 0 deletions src/util/articleSchema.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import { z } from "zod";

// Validation rules applied to every scraped article before caching.
// A title must be long enough to plausibly be a headline and must not be
// one of the known non-headline UI strings that leak into anchor text.
const MIN_TITLE_WORDS = 3;
const INVALID_TITLE_PREFIXES = ["Video Duration", "play", "play-inverse"];

const articleSchema = z.object({
  title: z
    .string()
    .refine((value) => value.split(" ").length >= MIN_TITLE_WORDS, {
      message: "Title must contain at least 3 words",
    })
    .refine(
      (value) =>
        INVALID_TITLE_PREFIXES.every((prefix) => !value.startsWith(prefix)),
      {
        message: "Title starts with an invalid prefix",
      },
    ),
  link: z.string().url(),
  source: z.string(),
});

export default articleSchema;
70 changes: 70 additions & 0 deletions src/util/articleUtils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import db from "./db";
import { Article } from "../types";
import articleSchema from "./articleSchema";

// Check `article` against the zod schema, logging the failure reasons when
// DEBUG is enabled.
//
// Uses `safeParse` instead of try/catch: the original read `e.errors` off an
// untyped catch variable, which does not compile under strict TS
// (`useUnknownInCatchVariables`) and would throw if anything other than a
// ZodError escaped `parse`.
//
// @returns true when the article satisfies the schema
const isValidArticle = (article: Article): boolean => {
  const result = articleSchema.safeParse(article);

  if (!result.success && process.env["DEBUG"] === "true") {
    console.log(
      `*** INVALID: ${article.source}: ${article.title} - ${result.error.errors
        .map((err) => err.message)
        .join(", ")}`,
    );
  }

  return result.success;
};

// Insert one article into the cache, skipping rows whose title already exists.
//
// NOTE(review): dedup is keyed on `title` even though the INSERT supplies
// `id` as well — elsewhere in this commit `id` is set to the title, so the
// two should agree; confirm if id generation ever changes.
const insertArticle = (article: Article) => {
  const debug = process.env["DEBUG"] === "true";

  const existing = db
    .prepare("SELECT COUNT(*) as count FROM articles WHERE title = ?")
    .get(article.title) as { count: number };

  // Early return keeps the happy path (the actual insert) unindented.
  if (existing.count > 0) {
    if (debug) {
      console.log(`*** DUPLICATE: ${article.title}`);
    }
    return;
  }

  try {
    db.prepare(
      "INSERT INTO articles (id, title, link, source, created_at) VALUES (?, ?, ?, ?, ?)",
    ).run(
      article.id,
      article.title,
      article.link,
      article.source,
      article.created_at,
    );
  } catch (error) {
    // `error` is `unknown` under strict TS — narrow before reading .message
    // (the original accessed error.message directly, which does not compile
    // with useUnknownInCatchVariables enabled).
    if (debug) {
      const message = error instanceof Error ? error.message : String(error);
      console.log(`*** ERROR: ${message}`);
    }
  }
};

// Wipe the `articles` table once the oldest cached row is older than
// `maxAgeHours`.
//
// @param maxAgeHours - cache lifetime in hours; defaults to 8, the value
//                      previously hard-coded here (and the same window
//                      isCacheValid uses).
const clearCacheIfNeeded = (maxAgeHours: number = 8) => {
  const oldestArticle = db
    .prepare("SELECT created_at FROM articles ORDER BY created_at ASC LIMIT 1")
    .get() as { created_at: string } | undefined;

  // Empty table: nothing to clear.
  if (!oldestArticle) {
    return;
  }

  const ageMs = Date.now() - new Date(oldestArticle.created_at).getTime();
  const hoursDifference = ageMs / (1000 * 60 * 60);

  if (hoursDifference >= maxAgeHours) {
    if (process.env["DEBUG"] === "true") {
      console.log("*** CLEARING CACHE");
    }
    db.prepare("DELETE FROM articles").run();
  }
};

export { isValidArticle, insertArticle, clearCacheIfNeeded };
Loading

0 comments on commit 2b83051

Please sign in to comment.