diff --git a/Makefile b/Makefile index 29beabb..c5e2167 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,13 @@ install: pnpm install --frozen-lockfile +test: + pnpm test + start: pnpm start +check: lint test helm-verify lint: pnpm lint diff --git a/src/app.js b/src/app.js index 126ea2a..8da3598 100644 --- a/src/app.js +++ b/src/app.js @@ -1,12 +1,11 @@ const express = require("express"); -const dns = require("node:dns/promises"); -const net = require("node:net"); const { JSDOM } = require("jsdom"); const { Readability } = require("@mozilla/readability"); const createDOMPurify = require("dompurify"); const { loadConfig, validateConfig } = require("./config"); -const { createLogger } = require("./logger"); +const { normalizeFetchError } = require("./errors"); +const { fetchArticleHtml } = require("./fetcher"); const { toMarkdown } = require("./markdown"); const { mapArticleResponse } = require("./response"); @@ -36,19 +35,6 @@ const domPurifyOptions = { ], }; -const USER_AGENT = - "Mozilla/5.0 (X11; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0"; -const HTML_CONTENT_TYPES = new Set(["text/html", "application/xhtml+xml"]); - -class FetchArticleError extends Error { - constructor(code, message, details = {}) { - super(message); - this.name = "FetchArticleError"; - this.code = code; - this.details = details; - } -} - function createConcurrencyGate(maxConcurrentRequests) { let activeRequests = 0; @@ -116,348 +102,57 @@ function sanitizeArticleContent(content) { return DOMPurify.sanitize(content, domPurifyOptions); } -function isPrivateIpv4(address) { - const parts = address.split(".").map((part) => Number.parseInt(part, 10)); +function parseArticle(html, url, config, contentFormat) { + const dom = new JSDOM(html, { url }); + const parsed = new Readability( + dom.window.document, + createReadabilityOptions(config), + ).parse(); - if (parts.length !== 4 || parts.some(Number.isNaN)) { - return false; - } + const sanitizedContent = sanitizeArticleContent(parsed?.content ?? null); + const finalContent = + sanitizedContent !== null && contentFormat === "markdown" + ? toMarkdown(sanitizedContent) + : sanitizedContent; - const [first, second] = parts; + const article = parsed ? { ...parsed, content: finalContent } : null; - return ( - first === 10 || - first === 127 || - (first === 169 && second === 254) || - (first === 172 && second >= 16 && second <= 31) || - (first === 192 && second === 168) - ); + return mapArticleResponse(url, article, dom.window.document); } -function isPrivateIpv6(address) { - const normalized = address.toLowerCase(); - - if (normalized === "::1") { - return true; - } - - if (normalized.startsWith("fe80:")) { - return true; - } - - if (normalized.startsWith("fc") || normalized.startsWith("fd")) { - return true; - } - - if (normalized.startsWith("::ffff:")) { - return isPrivateIpv4(normalized.slice("::ffff:".length)); - } - - return false; -} - -function isPrivateIp(address) { - const family = net.isIP(address); - - if (family === 4) { - return isPrivateIpv4(address); - } - - if (family === 6) { - return isPrivateIpv6(address); - } - - return false; -} - -async function assertAllowedUrl(rawUrl, config) { - let parsedUrl; - - try { - parsedUrl = new URL(rawUrl); - } catch (_error) { - throw new FetchArticleError( - "FETCH_INVALID_URL", - "URL must be a valid absolute URL", - { url: rawUrl }, - ); - } - - if (!["http:", "https:"].includes(parsedUrl.protocol)) { - throw new FetchArticleError( - "FETCH_UNSUPPORTED_PROTOCOL", - "Only http and https URLs are supported", - { - protocol: parsedUrl.protocol, - url: parsedUrl.toString(), - }, - ); - } - - if (!config.blockPrivateNetworks) { - return parsedUrl; - } - - const hostname = parsedUrl.hostname; - - if (net.isIP(hostname)) { - if (isPrivateIp(hostname)) { - throw new FetchArticleError( - "FETCH_PRIVATE_NETWORK_BLOCKED", - "Requests to private or loopback addresses are blocked", - { - address: hostname, - hostname, - url: parsedUrl.toString(), - }, - ); - } - - return parsedUrl; - } - - let addresses; - - try { - addresses = await dns.lookup(hostname, { - all: true, - verbatim: true, - }); - } catch (error) { - throw new FetchArticleError( - "FETCH_DNS_ERROR", - `Failed to resolve hostname ${hostname}`, - { - hostname, - cause: error.code || error.name, - }, - ); - } - - const blockedAddress = addresses.find(({ address }) => isPrivateIp(address)); - - if (blockedAddress) { - throw new FetchArticleError( - "FETCH_PRIVATE_NETWORK_BLOCKED", - "Requests to private or loopback addresses are blocked", - { - address: blockedAddress.address, - family: blockedAddress.family, - hostname, - url: parsedUrl.toString(), - }, - ); - } - - return parsedUrl; -} - -function validateContentType(response) { - const contentTypeHeader = response.headers.get("content-type"); - const mediaType = contentTypeHeader - ? contentTypeHeader.split(";")[0].trim().toLowerCase() - : ""; - - if (HTML_CONTENT_TYPES.has(mediaType)) { - return; - } - - throw new FetchArticleError( - "FETCH_NON_HTML_RESPONSE", - "Fetched response must be HTML content", - { - contentType: contentTypeHeader || null, - status: response.status, - url: response.url, +function createApp(configInput, logger) { + const config = validateConfig(configInput); + const log = logger || { + info(msg) { + console.log("[%s] %s", new Date().toISOString(), msg); }, - ); -} - -async function readBodyWithLimit(response, maxBytes) { - const contentLengthHeader = response.headers.get("content-length"); - - if (contentLengthHeader) { - const contentLength = Number.parseInt(contentLengthHeader, 10); - - if (Number.isInteger(contentLength) && contentLength > maxBytes) { - throw new FetchArticleError( - "FETCH_RESPONSE_TOO_LARGE", - `Fetched response exceeded byte limit of ${maxBytes}`, - { - contentLength, - maxBytes, - url: response.url, - }, - ); - } - } - - if (!response.body) { - return ""; - } - - const reader = response.body.getReader(); - const chunks = []; - let totalBytes = 0; - - while (true) { - const { done, value } = await reader.read(); - - if (done) { - break; - } - - totalBytes += value.byteLength; - - if (totalBytes > maxBytes) { - await reader.cancel("response exceeded configured byte limit"); - - throw new FetchArticleError( - "FETCH_RESPONSE_TOO_LARGE", - `Fetched response exceeded byte limit of ${maxBytes}`, - { - bytesRead: totalBytes, - maxBytes, - url: response.url, - }, - ); - } - - chunks.push(Buffer.from(value)); - } - - return Buffer.concat(chunks).toString("utf8"); -} - -function normalizeFetchError(error) { - if (error instanceof FetchArticleError) { - return error; - } - - if (error?.name === "TimeoutError" || error?.name === "AbortError") { - return new FetchArticleError("FETCH_TIMEOUT", "Fetch request timed out"); - } - - return new FetchArticleError("FETCH_NETWORK_ERROR", "Fetch request failed", { - cause: error?.code || error?.name || "UNKNOWN", - }); -} - -async function fetchArticleHtml(url, config) { - let currentUrl = await assertAllowedUrl(url, config); - - for (let redirectCount = 0; ; redirectCount += 1) { - let response; - - try { - response = await fetch(currentUrl, { - headers: { - "User-Agent": USER_AGENT, - }, - redirect: "manual", - signal: AbortSignal.timeout(config.fetchTimeoutMs), - }); - } catch (error) { - throw normalizeFetchError(error); - } - - if (response.status >= 300 && response.status < 400) { - if (redirectCount >= config.fetchMaxRedirects) { - throw new FetchArticleError( - "FETCH_REDIRECT_LIMIT_EXCEEDED", - `Fetch exceeded redirect limit of ${config.fetchMaxRedirects}`, - { - maxRedirects: config.fetchMaxRedirects, - status: response.status, - url: currentUrl.toString(), - }, - ); + error(msg, err) { + if (err) { + console.error("[%s] %s", new Date().toISOString(), msg, err); + } else { + console.error("[%s] %s", new Date().toISOString(), msg); } - - const location = response.headers.get("location"); - - if (!location) { - throw new FetchArticleError( - "FETCH_REDIRECT_WITHOUT_LOCATION", - "Redirect response did not include a Location header", - { - status: response.status, - url: currentUrl.toString(), - }, - ); - } - - currentUrl = await assertAllowedUrl( - new URL(location, currentUrl).toString(), - config, - ); - continue; - } - - if (!response.ok) { - throw new FetchArticleError( - "FETCH_HTTP_ERROR", - `Fetch failed with status code ${response.status}`, - { - status: response.status, - url: currentUrl.toString(), - }, - ); - } - - validateContentType(response); - - return { - body: await readBodyWithLimit(response, config.fetchMaxBytes), - finalUrl: response.url || currentUrl.toString(), - }; - } -} - -function normalizeErrorDetails(error) { - const normalizedError = normalizeFetchError(error); - - return { - code: normalizedError.code, - message: normalizedError.message, - ...normalizedError.details, + }, }; -} - -function isInvalidRequestError(error) { - return ( - error instanceof FetchArticleError && - ["FETCH_INVALID_URL", "FETCH_UNSUPPORTED_PROTOCOL"].includes(error.code) - ); -} -function createApp(configInput, loggerInput) { - const config = validateConfig(configInput); - const logger = loggerInput || createLogger(); const app = express(); app.get("/healthz", (_req, res) => { - res.status(200).json({ - ok: true, - }); + res.status(200).json({ ok: true }); }); app.use(express.json({ limit: config.requestBodyLimit })); app.use(createConcurrencyGate(config.maxConcurrentRequests)); app.get("/", (_req, res) => { - res.status(400).send({ - error: INVALID_GET_MESSAGE, - }); + res.status(400).send({ error: INVALID_GET_MESSAGE }); }); app.post("/", async (req, res) => { const url = req.body?.url; if (url === undefined || url === "") { - res.status(400).send({ - error: INVALID_REQUEST_MESSAGE, - }); + res.status(400).send({ error: INVALID_REQUEST_MESSAGE }); return; } @@ -468,51 +163,34 @@ function createApp(configInput, loggerInput) { config.contentFormat, ); } catch (error) { - res.status(400).send({ - error: error.message, - }); + res.status(400).send({ error: error.message }); return; } - logger.info(`Fetching ${url}...`); + log.info(`Fetching ${url}...`); try { const response = await fetchArticleHtml(url, config); - // Intentionally rely on jsdom defaults so inline scripts and external - // resource loading remain disabled during parsing. - const dom = new JSDOM(response.body, { url: response.finalUrl }); - const parsed = new Readability( - dom.window.document, - createReadabilityOptions(config), - ).parse(); - const sanitizedContent = sanitizeArticleContent(parsed?.content ?? null); - const finalContent = - sanitizedContent !== null && contentFormat === "markdown" - ? toMarkdown(sanitizedContent) - : sanitizedContent; - const article = parsed - ? { - ...parsed, - content: finalContent, - } - : null; - - logger.info(`Fetched and parsed ${url} successfully`); - - res - .status(200) - .send(mapArticleResponse(url, article, dom.window.document)); + const article = parseArticle( + response.body, + response.finalUrl, + config, + contentFormat, + ); + + log.info(`Fetched and parsed ${url} successfully`); + res.status(200).send(article); } catch (error) { - logger.error(`Failed to fetch or parse ${url}`, error); + log.error(`Failed to fetch or parse ${url}`, error); - const status = isInvalidRequestError(error) ? 400 : 500; + const normalized = normalizeFetchError(error); - res.status(status).send({ + res.status(normalized.statusCode).send({ error: - status === 400 + normalized.statusCode === 400 ? INVALID_REQUEST_MESSAGE : "Some weird error fetching the content", - details: normalizeErrorDetails(error), + details: normalized.toResponseBody(), }); } }); diff --git a/src/errors.js b/src/errors.js new file mode 100644 index 0000000..d95b3a3 --- /dev/null +++ b/src/errors.js @@ -0,0 +1,43 @@ +"use strict"; + +const INVALID_REQUEST_CODES = new Set([ + "FETCH_INVALID_URL", + "FETCH_UNSUPPORTED_PROTOCOL", +]); + +class FetchArticleError extends Error { + constructor(code, message, details = {}) { + super(message); + this.name = "FetchArticleError"; + this.code = code; + this.details = details; + } + + get statusCode() { + return INVALID_REQUEST_CODES.has(this.code) ? 400 : 500; + } + + toResponseBody() { + return { + code: this.code, + message: this.message, + ...this.details, + }; + } +} + +function normalizeFetchError(error) { + if (error instanceof FetchArticleError) { + return error; + } + + if (error?.name === "TimeoutError" || error?.name === "AbortError") { + return new FetchArticleError("FETCH_TIMEOUT", "Fetch request timed out"); + } + + return new FetchArticleError("FETCH_NETWORK_ERROR", "Fetch request failed", { + cause: error?.code || error?.name || "UNKNOWN", + }); +} + +module.exports = { FetchArticleError, normalizeFetchError }; diff --git a/src/fetcher.js b/src/fetcher.js new file mode 100644 index 0000000..309be9e --- /dev/null +++ b/src/fetcher.js @@ -0,0 +1,158 @@ +"use strict"; + +const { FetchArticleError, normalizeFetchError } = require("./errors"); +const { assertAllowedUrl } = require("./network"); + +const USER_AGENT = + "Mozilla/5.0 (X11; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0"; +const HTML_CONTENT_TYPES = new Set(["text/html", "application/xhtml+xml"]); + +function validateContentType(response) { + const contentTypeHeader = response.headers.get("content-type"); + const mediaType = contentTypeHeader + ? contentTypeHeader.split(";")[0].trim().toLowerCase() + : ""; + + if (HTML_CONTENT_TYPES.has(mediaType)) { + return; + } + + throw new FetchArticleError( + "FETCH_NON_HTML_RESPONSE", + "Fetched response must be HTML content", + { + contentType: contentTypeHeader || null, + status: response.status, + url: response.url, + }, + ); +} + +async function readBodyWithLimit(response, maxBytes) { + const contentLengthHeader = response.headers.get("content-length"); + + if (contentLengthHeader) { + const contentLength = Number.parseInt(contentLengthHeader, 10); + + if (Number.isInteger(contentLength) && contentLength > maxBytes) { + throw new FetchArticleError( + "FETCH_RESPONSE_TOO_LARGE", + `Fetched response exceeded byte limit of ${maxBytes}`, + { + contentLength, + maxBytes, + url: response.url, + }, + ); + } + } + + if (!response.body) { + return ""; + } + + const reader = response.body.getReader(); + const chunks = []; + let totalBytes = 0; + + while (true) { + const { done, value } = await reader.read(); + + if (done) { + break; + } + + totalBytes += value.byteLength; + + if (totalBytes > maxBytes) { + await reader.cancel("response exceeded configured byte limit"); + + throw new FetchArticleError( + "FETCH_RESPONSE_TOO_LARGE", + `Fetched response exceeded byte limit of ${maxBytes}`, + { + bytesRead: totalBytes, + maxBytes, + url: response.url, + }, + ); + } + + chunks.push(Buffer.from(value)); + } + + return Buffer.concat(chunks).toString("utf8"); +} + +async function fetchArticleHtml(url, config) { + let currentUrl = await assertAllowedUrl(url, config); + + for (let redirectCount = 0; ; redirectCount += 1) { + let response; + + try { + response = await fetch(currentUrl, { + headers: { + "User-Agent": USER_AGENT, + }, + redirect: "manual", + signal: AbortSignal.timeout(config.fetchTimeoutMs), + }); + } catch (error) { + throw normalizeFetchError(error); + } + + if (response.status >= 300 && response.status < 400) { + if (redirectCount >= config.fetchMaxRedirects) { + throw new FetchArticleError( + "FETCH_REDIRECT_LIMIT_EXCEEDED", + `Fetch exceeded redirect limit of ${config.fetchMaxRedirects}`, + { + maxRedirects: config.fetchMaxRedirects, + status: response.status, + url: currentUrl.toString(), + }, + ); + } + + const location = response.headers.get("location"); + + if (!location) { + throw new FetchArticleError( + "FETCH_REDIRECT_WITHOUT_LOCATION", + "Redirect response did not include a Location header", + { + status: response.status, + url: currentUrl.toString(), + }, + ); + } + + currentUrl = await assertAllowedUrl( + new URL(location, currentUrl).toString(), + config, + ); + continue; + } + + if (!response.ok) { + throw new FetchArticleError( + "FETCH_HTTP_ERROR", + `Fetch failed with status code ${response.status}`, + { + status: response.status, + url: currentUrl.toString(), + }, + ); + } + + validateContentType(response); + + return { + body: await readBodyWithLimit(response, config.fetchMaxBytes), + finalUrl: response.url || currentUrl.toString(), + }; + } +} + +module.exports = { fetchArticleHtml }; diff --git a/src/logger.js b/src/logger.js deleted file mode 100644 index ee380f5..0000000 --- a/src/logger.js +++ /dev/null @@ -1,29 +0,0 @@ -function timestamp() { - return new Date().toISOString(); -} - -function write(method, message, error) { - const prefix = `[${timestamp()}]`; - - if (error) { - console[method](`${prefix} ${message}`, error); - return; - } - - console[method](`${prefix} ${message}`); -} - -function createLogger() { - return { - info(message) { - write("log", message); - }, - error(message, error) { - write("error", message, error); - }, - }; -} - -module.exports = { - createLogger, -}; diff --git a/src/network.js b/src/network.js new file mode 100644 index 0000000..0074251 --- /dev/null +++ b/src/network.js @@ -0,0 +1,149 @@ +"use strict"; + +const dns = require("node:dns/promises"); +const net = require("node:net"); + +const { FetchArticleError } = require("./errors"); + +function isPrivateIpv4(address) { + const parts = address.split(".").map((part) => Number.parseInt(part, 10)); + + if (parts.length !== 4 || parts.some(Number.isNaN)) { + return false; + } + + const [first, second] = parts; + + return ( + first === 10 || + first === 127 || + (first === 169 && second === 254) || + (first === 172 && second >= 16 && second <= 31) || + (first === 192 && second === 168) + ); +} + +function isPrivateIpv6(address) { + const normalized = address.toLowerCase(); + + if (normalized === "::1") { + return true; + } + + if (normalized.startsWith("fe80:")) { + return true; + } + + if (normalized.startsWith("fc") || normalized.startsWith("fd")) { + return true; + } + + if (normalized.startsWith("::ffff:")) { + return isPrivateIpv4(normalized.slice("::ffff:".length)); + } + + return false; +} + +function isPrivateIp(address) { + const family = net.isIP(address); + + if (family === 4) { + return isPrivateIpv4(address); + } + + if (family === 6) { + return isPrivateIpv6(address); + } + + return false; +} + +async function assertAllowedUrl(rawUrl, config) { + let parsedUrl; + + try { + parsedUrl = new URL(rawUrl); + } catch (_error) { + throw new FetchArticleError( + "FETCH_INVALID_URL", + "URL must be a valid absolute URL", + { url: rawUrl }, + ); + } + + if (!["http:", "https:"].includes(parsedUrl.protocol)) { + throw new FetchArticleError( + "FETCH_UNSUPPORTED_PROTOCOL", + "Only http and https URLs are supported", + { + protocol: parsedUrl.protocol, + url: parsedUrl.toString(), + }, + ); + } + + if (!config.blockPrivateNetworks) { + return parsedUrl; + } + + const hostname = parsedUrl.hostname; + + if (net.isIP(hostname)) { + if (isPrivateIp(hostname)) { + throw new FetchArticleError( + "FETCH_PRIVATE_NETWORK_BLOCKED", + "Requests to private or loopback addresses are blocked", + { + address: hostname, + hostname, + url: parsedUrl.toString(), + }, + ); + } + + return parsedUrl; + } + + let addresses; + + try { + addresses = await dns.lookup(hostname, { + all: true, + verbatim: true, + }); + } catch (error) { + throw new FetchArticleError( + "FETCH_DNS_ERROR", + `Failed to resolve hostname ${hostname}`, + { + hostname, + cause: error.code || error.name, + }, + ); + } + + const blockedAddress = addresses.find(({ address }) => isPrivateIp(address)); + + if (blockedAddress) { + throw new FetchArticleError( + "FETCH_PRIVATE_NETWORK_BLOCKED", + "Requests to private or loopback addresses are blocked", + { + address: blockedAddress.address, + family: blockedAddress.family, + hostname, + url: parsedUrl.toString(), + }, + ); + } + + return parsedUrl; +} + +module.exports = { + assertAllowedUrl, + isPrivateIp, + isPrivateIpv4, + isPrivateIpv6, +}; diff --git a/src/server.js b/src/server.js index f30d71b..1410c7c 100644 --- a/src/server.js +++ b/src/server.js @@ -1,16 +1,24 @@ const { loadConfig } = require("./config"); -const { createLogger } = require("./logger"); const app = require("./app"); const { version } = require("../package.json"); const config = loadConfig(); -const logger = createLogger(); + +function log(message) { + console.log(`[${new Date().toISOString()}] ${message}`); +} + +function logError(message, error) { + if (error) { + console.error(`[${new Date().toISOString()}] ${message}`, error); + } else { + console.error(`[${new Date().toISOString()}] ${message}`); + } +} const shutdownTimeoutMs = 10_000; const server = app.listen(config.port, () => { - logger.info( - `Readability.js server v${version} listening on port ${config.port}!`, - ); + log(`Readability.js server v${version} listening on port ${config.port}!`); }); let isShuttingDown = false; @@ -23,7 +31,7 @@ function closeServer(signal) { isShuttingDown = true; process.exitCode = 0; - logger.info( + log( `Received ${signal}, starting graceful shutdown with a ${shutdownTimeoutMs}ms timeout...`, ); @@ -32,7 +40,7 @@ function closeServer(signal) { } const forceCloseTimer = setTimeout(() => { - logger.info( + log( `Graceful shutdown timed out after ${shutdownTimeoutMs}ms, closing remaining connections...`, ); @@ -47,11 +55,11 @@ function closeServer(signal) { clearTimeout(forceCloseTimer); if (error) { - logger.error("HTTP server shutdown failed", error); + logError("HTTP server shutdown failed", error); return; } - logger.info("HTTP server closed cleanly, exiting."); + log("HTTP server closed cleanly, exiting."); process.exitCode = 0; }); }