Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions scripts/memory-soak.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ function formatMegabytes(bytes) {
}

function snapshotMemory(completed, failures) {
if (global.gc) {
global.gc();
}

const usage = process.memoryUsage();

return {
Expand All @@ -55,6 +59,7 @@ function snapshotMemory(completed, failures) {
rss: usage.rss,
heapUsed: usage.heapUsed,
external: usage.external,
arrayBuffers: usage.arrayBuffers,
};
}

Expand All @@ -74,6 +79,7 @@ function logSample(sample) {
`rss=${formatMegabytes(sample.rss)}`,
`heapUsed=${formatMegabytes(sample.heapUsed)}`,
`external=${formatMegabytes(sample.external)}`,
`arrayBuffers=${formatMegabytes(sample.arrayBuffers)}`,
].join(" "),
);
}
Expand Down
56 changes: 3 additions & 53 deletions src/app.js
Original file line number Diff line number Diff line change
@@ -1,40 +1,15 @@
const express = require("express");
const { JSDOM } = require("jsdom");
const { Readability } = require("@mozilla/readability");
const createDOMPurify = require("dompurify");

const { loadConfig, validateConfig } = require("./config");
const { normalizeFetchError } = require("./errors");
const { fetchArticleHtml } = require("./fetcher");
const { toMarkdown } = require("./markdown");
const { mapArticleResponse } = require("./response");

const DOMPurify = createDOMPurify(new JSDOM("").window);
const { createParserPool } = require("./parser");

// Hint text telling clients to send a JSON body that carries a `url` field.
const INVALID_REQUEST_MESSAGE = 'Send JSON, like so: {"url": "https://url/to/whatever"}';

// Variant of the hint for clients that issued a GET instead of a POST.
const INVALID_GET_MESSAGE = 'POST (not GET) JSON, like so: {"url": "https://url/to/whatever"}';

// Options handed to DOMPurify.sanitize(): embedded media elements, and the
// attributes they rely on, are added to the sanitizer's allow-lists.
const domPurifyOptions = {
  // Additional elements to keep.
  ADD_TAGS: ["iframe", "video"],
  // Additional attributes to keep (alphabetical).
  ADD_ATTR: [
    "allow", "allowfullscreen", "autoplay", "controls",
    "frameborder", "loading", "loop", "muted",
    "playsinline", "poster", "preload", "referrerpolicy",
    "scrolling",
  ],
};

function createConcurrencyGate(maxConcurrentRequests) {
let activeRequests = 0;

Expand Down Expand Up @@ -94,32 +69,6 @@ function createReadabilityOptions(config) {
};
}

/**
 * Runs extracted article HTML through DOMPurify.
 *
 * @param {string|null|undefined} content - Article HTML, if any was extracted.
 * @returns {*} Whatever DOMPurify.sanitize returns for `content`, or `null`
 *   when `content` is falsy (missing or empty).
 */
function sanitizeArticleContent(content) {
  return content ? DOMPurify.sanitize(content, domPurifyOptions) : null;
}

/**
 * Builds a jsdom document for the page, extracts the article with
 * Readability, sanitizes the extracted HTML and optionally converts it to
 * Markdown, then maps everything into the response shape.
 *
 * @param {string} html - Page markup.
 * @param {string} url - URL the document is created with; also forwarded to
 *   `mapArticleResponse`.
 * @param {object} config - Passed to `createReadabilityOptions`.
 * @param {string} contentFormat - `"markdown"` converts the sanitized content
 *   with `toMarkdown`; any other value leaves it as sanitized HTML.
 * @returns {*} The value returned by `mapArticleResponse`.
 */
function parseArticle(html, url, config, contentFormat) {
  const pageDocument = new JSDOM(html, { url }).window.document;
  const reader = new Readability(pageDocument, createReadabilityOptions(config));
  const parsed = reader.parse();

  // Readability may yield no article at all; treat that as "no content".
  const rawContent = parsed?.content ?? null;
  let content = sanitizeArticleContent(rawContent);
  if (content !== null && contentFormat === "markdown") {
    content = toMarkdown(content);
  }

  let article = null;
  if (parsed) {
    article = { ...parsed, content };
  }

  return mapArticleResponse(url, article, pageDocument);
}

function createApp(configInput, logger) {
const config = validateConfig(configInput);
const log = logger || {
Expand All @@ -135,6 +84,7 @@ function createApp(configInput, logger) {
},
};

const parser = createParserPool();
const app = express();

app.get("/healthz", (_req, res) => {
Expand Down Expand Up @@ -171,7 +121,7 @@ function createApp(configInput, logger) {

try {
const response = await fetchArticleHtml(url, config);
const article = parseArticle(
const article = await parser.parse(
response.body,
response.finalUrl,
config,
Expand Down
100 changes: 100 additions & 0 deletions src/parser-worker.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"use strict";

const { JSDOM } = require("jsdom");
const { Readability } = require("@mozilla/readability");
const createDOMPurify = require("dompurify");

const { toMarkdown } = require("./markdown");
const { mapArticleResponse } = require("./response");

// Fallback used when PARSER_MAX_PARSES is unset, empty, or not a number.
const DEFAULT_MAX_PARSES = 500;

/**
 * Converts the raw PARSER_MAX_PARSES value into the recycle threshold.
 *
 * A non-numeric value would parse to NaN, and `parseCount >= NaN` is never
 * true, so the worker would silently never request recycling. Such values
 * fall back to the default instead.
 *
 * @param {string|undefined} raw - Raw environment value.
 * @returns {number} Parsed integer, or DEFAULT_MAX_PARSES when `raw` is
 *   missing, empty, or does not start with an integer.
 */
function resolveMaxParses(raw) {
  const parsed = Number.parseInt(raw || String(DEFAULT_MAX_PARSES), 10);
  return Number.isNaN(parsed) ? DEFAULT_MAX_PARSES : parsed;
}

// Number of handled messages after which this worker asks to be recycled.
const maxParses = resolveMaxParses(process.env.PARSER_MAX_PARSES);
// Messages handled so far by this worker process.
let parseCount = 0;

// A single empty jsdom window is created once at module load. DOMPurify is
// bound to it, and its DOMParser is reused by parseArticle for every page.
const sharedWindow = new JSDOM("").window;
const DOMPurify = createDOMPurify(sharedWindow);
const sharedDOMParser = new sharedWindow.DOMParser();

// Options handed to DOMPurify.sanitize(): embedded media elements, and the
// attributes they rely on, are added to the sanitizer's allow-lists.
const domPurifyOptions = {
  // Additional elements to keep.
  ADD_TAGS: ["iframe", "video"],
  // Additional attributes to keep (alphabetical).
  ADD_ATTR: [
    "allow", "allowfullscreen", "autoplay", "controls",
    "frameborder", "loading", "loop", "muted",
    "playsinline", "poster", "preload", "referrerpolicy",
    "scrolling",
  ],
  // sanitizeArticleContent treats the sanitize() result as a DOM node
  // (it reads ownerDocument and appends it to a container).
  RETURN_DOM_FRAGMENT: true,
};

/**
 * Translates service configuration into the options object given to
 * Readability.
 *
 * @param {{ readabilityMaxElems?: number }} config - Service configuration.
 * @returns {{ maxElemsToParse: number } | undefined} `undefined` when
 *   `readabilityMaxElems` is not set, otherwise an object carrying it as
 *   `maxElemsToParse`.
 */
function createReadabilityOptions(config) {
  const { readabilityMaxElems } = config;
  return readabilityMaxElems === undefined
    ? undefined
    : { maxElemsToParse: readabilityMaxElems };
}

/**
 * Sanitizes extracted article HTML and serializes the result back to a
 * string.
 *
 * DOMPurify is configured (via `domPurifyOptions`) to return a DOM node, so
 * the node is appended to a throwaway `<div>` created in the node's own
 * document and that container's `innerHTML` is returned.
 *
 * @param {string|null|undefined} content - Article HTML, if any was extracted.
 * @returns {string|null} Sanitized HTML, or `null` when `content` is falsy.
 */
function sanitizeArticleContent(content) {
  if (content) {
    const cleaned = DOMPurify.sanitize(content, domPurifyOptions);
    const wrapper = cleaned.ownerDocument.createElement("div");
    wrapper.appendChild(cleaned);
    return wrapper.innerHTML;
  }

  return null;
}

/**
 * Escapes a string for use inside a double-quoted HTML attribute value.
 *
 * @param {string} value - Raw attribute value.
 * @returns {string} `value` with `&` and `"` replaced by character references.
 */
function escapeHtmlAttribute(value) {
  // `&` must be escaped first so the `&quot;` produced next is left intact.
  return value.replace(/&/g, "&amp;").replace(/"/g, "&quot;");
}

/**
 * Inserts `<base href="url">` into the markup so that relative links resolve
 * against the page URL when the markup is parsed without a document URL.
 *
 * @param {string} html - Page markup.
 * @param {string} url - URL the page was fetched from.
 * @returns {string} Markup with a `<base>` element added.
 */
function injectBaseTag(html, url) {
  const baseTag = `<base href="${escapeHtmlAttribute(url)}">`;
  // The lookahead keeps `<head` from matching `<header ...>`, and `<html`
  // from matching longer tag names.
  const headOpenTag = /<head(?=[\s/>])[^>]*>/i;
  const htmlOpenTag = /<html(?=[\s/>])[^>]*>/i;

  // Replacer functions are used throughout: with a replacement *string*,
  // sequences such as `$1` or `$&` inside the URL would be expanded.
  if (headOpenTag.test(html)) {
    return html.replace(headOpenTag, (tag) => `${tag}${baseTag}`);
  }
  if (htmlOpenTag.test(html)) {
    return html.replace(htmlOpenTag, (tag) => `${tag}<head>${baseTag}</head>`);
  }

  // Neither tag is present (fragment or tag-omitting document): put the base
  // element first, but after a leading doctype so the doctype still applies.
  return html.replace(/^(?:\s*<!doctype[^>]*>)?/i, (prefix) => `${prefix}${baseTag}`);
}

/**
 * Parses a fetched page with the shared DOMParser, extracts the article with
 * Readability, sanitizes the extracted HTML and optionally converts it to
 * Markdown, then maps everything into the response shape.
 *
 * @param {string} html - Page markup.
 * @param {string} url - URL the page was fetched from; injected as the
 *   document's `<base>` and forwarded to `mapArticleResponse`.
 * @param {object} config - Passed to `createReadabilityOptions`.
 * @param {string} contentFormat - `"markdown"` converts the sanitized content
 *   with `toMarkdown`; any other value leaves it as sanitized HTML.
 * @returns {*} The value returned by `mapArticleResponse`.
 */
function parseArticle(html, url, config, contentFormat) {
  const doc = sharedDOMParser.parseFromString(
    injectBaseTag(html, url),
    "text/html",
  );

  const parsed = new Readability(
    doc,
    createReadabilityOptions(config),
  ).parse();

  // Readability may yield no article at all; treat that as "no content".
  const sanitizedContent = sanitizeArticleContent(parsed?.content ?? null);
  const finalContent =
    sanitizedContent !== null && contentFormat === "markdown"
      ? toMarkdown(sanitizedContent)
      : sanitizedContent;

  const article = parsed ? { ...parsed, content: finalContent } : null;

  return mapArticleResponse(url, article, doc);
}

/**
 * Handles one parse request received over the IPC channel.
 *
 * Replies with `{ id, result }` on success or `{ id, error }` when parsing
 * (or sending the result) throws. Every handled message counts toward the
 * recycle threshold; once it is reached a `{ type: "recycle" }` notice is
 * sent after each reply.
 *
 * @param {{ id: number, html: string, url: string, config: object,
 *   contentFormat: string }} msg - Request sent by the parent process.
 */
function handleParseRequest(msg) {
  try {
    const { html, url, config, contentFormat } = msg;
    const result = parseArticle(html, url, config, contentFormat);
    process.send({ id: msg.id, result });
  } catch (error) {
    // Only the message and code are forwarded to the parent.
    const failure = { message: error.message, code: error.code };
    process.send({ id: msg.id, error: failure });
  }

  parseCount += 1;
  if (parseCount >= maxParses) {
    process.send({ type: "recycle" });
  }
}

process.on("message", handleParseRequest);

// Runs once the module has finished loading: tells the parent process that
// this worker can now accept parse requests.
process.send({ type: "ready" });
129 changes: 129 additions & 0 deletions src/parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"use strict";

const { fork } = require("node:child_process");
const path = require("node:path");

// Path of the worker module (next to this file) that createParserPool forks.
const WORKER_SCRIPT = path.join(__dirname, "parser-worker.js");

/**
 * Creates a parser backed by a forked worker process that is replaced after
 * it has handled `maxParses` requests.
 *
 * At most one worker accepts new requests at any time. When a worker asks to
 * be recycled it stops receiving new work, finishes the requests it already
 * has, and is then killed; the next request forks a fresh worker. All state
 * is tracked per worker, so a retiring worker and its replacement never share
 * flags, listeners, or in-flight bookkeeping.
 *
 * @param {number} [maxParses=500] - Passed to the worker through the
 *   PARSER_MAX_PARSES environment variable.
 * @returns {{ parse: (html: string, url: string, config: object,
 *   contentFormat: string) => Promise<*> }} `parse` resolves with the
 *   worker's result and rejects when the worker reports an error, cannot be
 *   reached, or exits before answering.
 */
function createParserPool(maxParses = 500) {
  // Worker that receives new requests; null until first use and after the
  // current worker has been retired or has died.
  let active = null;
  let nextId = 0;

  // An idle worker must not keep the parent's event loop alive; a worker
  // with requests in flight must.
  function setReferenced(worker, referenced) {
    if (referenced) {
      worker.proc.ref();
      worker.proc.channel?.ref();
    } else {
      worker.proc.unref();
      worker.proc.channel?.unref();
    }
  }

  // Detaches from a worker process and kills it.
  function terminate(worker) {
    worker.proc.removeAllListeners();
    // Deliberate best-effort: a failed kill is reported through an "error"
    // event, which would throw if it had no listener.
    worker.proc.on("error", () => {});
    worker.proc.kill();
  }

  // Kills a worker that asked to be recycled once nothing is in flight on it.
  function retireIfDrained(worker) {
    if (worker.draining && worker.pending.size === 0) {
      terminate(worker);
    }
  }

  function onMessage(worker, msg) {
    if (msg.type === "ready") {
      worker.markReady();
      return;
    }

    if (msg.type === "recycle") {
      // Stop routing new requests here, but let in-flight ones finish.
      worker.draining = true;
      if (active === worker) {
        active = null;
      }
      retireIfDrained(worker);
      return;
    }

    const entry = worker.pending.get(msg.id);
    if (!entry) return;
    worker.pending.delete(msg.id);

    if (msg.error) {
      const failure = new Error(msg.error.message);
      // Keep the worker-side error code so callers can still branch on it.
      if (msg.error.code !== undefined) {
        failure.code = msg.error.code;
      }
      entry.reject(failure);
    } else {
      entry.resolve(msg.result);
    }

    if (worker.pending.size === 0) {
      setReferenced(worker, false);
    }
    retireIfDrained(worker);
  }

  // The worker errored or exited: fail everything that depends on it,
  // including callers still waiting for it to become ready.
  function onGone(worker, err) {
    if (active === worker) {
      active = null;
    }
    worker.markFailed(err);
    for (const entry of worker.pending.values()) {
      entry.reject(err);
    }
    worker.pending.clear();
    terminate(worker);
  }

  function spawn() {
    const proc = fork(WORKER_SCRIPT, [], {
      env: { ...process.env, PARSER_MAX_PARSES: String(maxParses) },
      serialization: "advanced",
      stdio: "inherit",
    });

    const worker = { proc, pending: new Map(), draining: false };
    worker.whenReady = new Promise((resolve, reject) => {
      worker.markReady = resolve;
      worker.markFailed = reject;
    });
    // Callers attach their own handlers; this one only prevents an
    // unhandled-rejection report when the worker dies with nobody waiting.
    worker.whenReady.catch(() => {});

    setReferenced(worker, false);

    proc.on("message", (msg) => onMessage(worker, msg));
    proc.on("error", (err) => onGone(worker, err));
    proc.on("exit", () => {
      onGone(worker, new Error("Parser process exited unexpectedly"));
    });

    return worker;
  }

  function parse(html, url, config, contentFormat) {
    const id = nextId;
    nextId += 1;

    if (!active) {
      active = spawn();
    }
    const worker = active;

    return worker.whenReady.then(
      () =>
        new Promise((resolve, reject) => {
          worker.pending.set(id, { resolve, reject });
          setReferenced(worker, true);
          try {
            worker.proc.send({ id, html, url, config, contentFormat });
          } catch (err) {
            // A synchronous send failure must not leave a stale entry
            // behind: it would keep the worker referenced and block every
            // later recycle.
            worker.pending.delete(id);
            if (worker.pending.size === 0) {
              setReferenced(worker, false);
            }
            retireIfDrained(worker);
            reject(err);
          }
        }),
    );
  }

  return { parse };
}

module.exports = { createParserPool };
Loading