Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,15 @@ Body:

```json
{
"url": "https://example.com/article"
"url": "https://example.com/article",
"contentFormat": "markdown"
}
```

The `url` field is required. Only absolute `http:` and `https:` URLs are accepted.

The `contentFormat` field is optional and controls the format of the `content` response field. Valid values are `"markdown"` (default) or `"html"`. This field overrides the server-wide `CONTENT_FORMAT` environment variable on a per-request basis.

### Success response

HTTP 200 returns the requested URL plus the parsed article fields:
Expand All @@ -47,7 +50,7 @@ HTTP 200 returns the requested URL plus the parsed article fields:
"title": "Article title",
"byline": "Author name",
"dir": "ltr",
"content": "<article>...</article>",
"content": "# Article title\n\nAuthor name\n\n...",
"length": 12345,
"excerpt": "Short summary",
"siteName": "Site name",
Expand All @@ -57,6 +60,8 @@ HTTP 200 returns the requested URL plus the parsed article fields:
}
```

The `content` field is formatted as markdown by default. To receive HTML instead, either pass `"contentFormat": "html"` in the request body or set the `CONTENT_FORMAT` environment variable to `html`.

Fields are emitted in the exact response shape defined by the service. Nullable fields may come back as `null`.

### Error response
Expand Down Expand Up @@ -93,6 +98,7 @@ All configuration is driven by environment variables.
| `BLOCK_PRIVATE_NETWORKS` | `true` | Block loopback and private-network targets by default. |
| `READABILITY_MAX_ELEMS` | unset | Optional Readability parse cap for very large documents. |
| `MAX_CONCURRENT_REQUESTS` | `10` | Maximum in-flight requests per process before returning `429`. |
| `CONTENT_FORMAT` | `markdown` | Default format for the `content` response field. Valid values: `markdown` or `html` (case-insensitive). Can be overridden per request via the `contentFormat` body field. |

Example:

Expand Down Expand Up @@ -182,6 +188,15 @@ This service is still an untrusted content fetcher. Do not relax the defaults wi
- Per-process concurrency is capped by `MAX_CONCURRENT_REQUESTS`
- The response shape is fixed; do not add fields casually

## Breaking change: contentFormat default

The `content` response field is now returned as **markdown by default** instead of HTML. Existing consumers that expect HTML must either:

1. Set the `CONTENT_FORMAT=html` environment variable (server-wide default), or
2. Pass `contentFormat: "html"` in each request

This change makes article content more portable and easier to consume, but requires explicit opt-in to preserve the previous HTML output.

## Memory behavior

The service does not keep article state between requests, but each fetch still allocates DOM and Readability objects while it parses. Short memory soaks show growth in `rss` and `heapUsed` during active work, while `external` stays comparatively flat. That is the signal to watch for leak regressions: sustained growth across longer runs, not a single small sample.
Expand Down
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
"@mozilla/readability": "^0.6.0",
"dompurify": "^3.4.11",
"express": "^5.2.1",
"jsdom": "^29.1.1"
"jsdom": "^29.1.1",
"turndown": "^7.2.4",
"turndown-plugin-gfm": "^1.0.2"
},
"scripts": {
"start": "nodemon src/server.js",
Expand Down
24 changes: 24 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 35 additions & 1 deletion src/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const createDOMPurify = require("dompurify");

const { loadConfig, validateConfig } = require("./config");
const { createLogger } = require("./logger");
const { toMarkdown } = require("./markdown");
const { mapArticleResponse } = require("./response");

const DOMPurify = createDOMPurify(new JSDOM("").window);
Expand Down Expand Up @@ -82,6 +83,21 @@ function createConcurrencyGate(maxConcurrentRequests) {
};
}

/**
 * Resolves the content format for a single request.
 *
 * @param {unknown} rawValue - The `contentFormat` value taken from the request
 *   body; `undefined` means the caller did not supply one.
 * @param {string} defaultFormat - Format returned as-is when `rawValue` is
 *   `undefined`.
 * @returns {string} `defaultFormat`, or the trimmed, lower-cased request value
 *   (`"markdown"` or `"html"`).
 * @throws {Error} When `rawValue` is present but is not a string naming one of
 *   the supported formats.
 */
function validateRequestContentFormat(rawValue, defaultFormat) {
  if (rawValue === undefined) {
    return defaultFormat;
  }

  const validFormats = ["markdown", "html"];

  // Only real strings are normalised. Coercing with String() would let JSON
  // values such as ["html"] slip through, because String(["html"]) === "html".
  const normalized =
    typeof rawValue === "string" ? rawValue.trim().toLowerCase() : undefined;

  if (!validFormats.includes(normalized)) {
    throw new Error(`contentFormat must be one of: ${validFormats.join(", ")}`);
  }

  return normalized;
}

function createReadabilityOptions(config) {
if (config.readabilityMaxElems === undefined) {
return undefined;
Expand Down Expand Up @@ -439,6 +455,19 @@ function createApp(configInput, loggerInput) {
return;
}

let contentFormat;
try {
contentFormat = validateRequestContentFormat(
req.body?.contentFormat,
config.contentFormat,
);
} catch (error) {
res.status(400).send({
error: error.message,
});
return;
}

logger.info(`Fetching ${url}...`);

try {
Expand All @@ -450,10 +479,15 @@ function createApp(configInput, loggerInput) {
dom.window.document,
createReadabilityOptions(config),
).parse();
const sanitizedContent = sanitizeArticleContent(parsed?.content ?? null);
const finalContent =
sanitizedContent !== null && contentFormat === "markdown"
? toMarkdown(sanitizedContent)
: sanitizedContent;
const article = parsed
? {
...parsed,
content: sanitizeArticleContent(parsed.content),
content: finalContent,
}
: null;

Expand Down
14 changes: 14 additions & 0 deletions src/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const DEFAULTS = Object.freeze({
FETCH_MAX_REDIRECTS: 5,
BLOCK_PRIVATE_NETWORKS: true,
MAX_CONCURRENT_REQUESTS: 10,
CONTENT_FORMAT: "markdown",
});

const BODY_LIMIT_PATTERN = /^\d+(b|kb|mb|gb)?$/i;
Expand Down Expand Up @@ -73,6 +74,17 @@ function parseBodyLimit(rawValue) {
return rawValue.trim().toLowerCase();
}

/**
 * Normalises a configured content format and checks it against the supported
 * set.
 *
 * @param {string} name - Setting name used as the prefix of the error message.
 * @param {unknown} rawValue - Configured value; it is stringified, trimmed and
 *   lower-cased before being checked.
 * @returns {string} Either `"markdown"` or `"html"`.
 * @throws {Error} When the normalised value is not a supported format.
 */
function parseContentFormat(name, rawValue) {
  const supported = ["markdown", "html"];
  const candidate = String(rawValue).trim().toLowerCase();

  if (supported.includes(candidate)) {
    return candidate;
  }

  throw new Error(`${name} must be one of: ${supported.join(", ")}`);
}

function validateConfig(configInput) {
const config = configInput || {};

Expand Down Expand Up @@ -100,6 +112,7 @@ function validateConfig(configInput) {
"MAX_CONCURRENT_REQUESTS",
config.maxConcurrentRequests,
),
contentFormat: parseContentFormat("CONTENT_FORMAT", config.contentFormat),
};
}

Expand All @@ -118,6 +131,7 @@ function loadConfig(env = process.env) {
: env.READABILITY_MAX_ELEMS,
maxConcurrentRequests:
env.MAX_CONCURRENT_REQUESTS ?? DEFAULTS.MAX_CONCURRENT_REQUESTS,
contentFormat: env.CONTENT_FORMAT ?? DEFAULTS.CONTENT_FORMAT,
});
}

Expand Down
48 changes: 48 additions & 0 deletions src/markdown.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"use strict";

const TurndownService = require("turndown");
const { gfm } = require("turndown-plugin-gfm");

// Embed providers recognised when labelling iframe links. Each entry pairs a
// pattern with the human-readable label used in the generated link text.
// Entries are checked in order and the first matching pattern wins.
const KNOWN_EMBED_DOMAINS = [
  [/youtube\.com|youtu\.be/, "YouTube"],
  [/vimeo\.com/, "Vimeo"],
  [/dailymotion\.com/, "Dailymotion"],
].map(([pattern, label]) => ({ pattern, label }));

// Module-level converter shared by every conversion in this file, with the
// `gfm` plugin from turndown-plugin-gfm registered on it.
const turndownService = new TurndownService().use(gfm);

// Turndown rule: replace an <iframe> with a markdown link to its `src`.
// Known video providers get a "Video: <provider>" label; anything else is
// labelled "Embedded content". An iframe without `src` produces no output.
// NOTE(review): whether iframes survive sanitisation before reaching this rule
// depends on the sanitiser configuration, which is not visible here.
turndownService.addRule("iframe", {
  filter: "iframe",
  replacement(content, node) {
    const src = node.getAttribute("src");
    if (!src) {
      return "";
    }

    // Match providers against the hostname only. Testing the whole URL would
    // mislabel links that merely mention a provider in their path or query,
    // e.g. "https://example.com/?ref=youtube.com".
    let hostname = "";
    try {
      // The base lets protocol-relative sources ("//host/path") resolve. A
      // purely relative src resolves to the placeholder host, which matches
      // no provider.
      hostname = new URL(src, "https://relative.invalid").hostname;
    } catch {
      // Unparseable src: keep the empty hostname so the generic label is used.
    }

    const match = KNOWN_EMBED_DOMAINS.find(({ pattern }) =>
      pattern.test(hostname),
    );
    if (match) {
      return `[Video: ${match.label}](${src})`;
    }

    return `[Embedded content](${src})`;
  },
});

// Turndown rule: replace a <video> element with a "[Video](src)" markdown
// link. The URL comes from the element's own `src` attribute or, when that is
// absent, from the first nested <source src="..."> child. A video with neither
// produces no output.
turndownService.addRule("video", {
  filter: "video",
  replacement(content, node) {
    // <video> commonly declares its media through <source> children instead
    // of a `src` attribute; without this fallback such videos vanish from the
    // markdown output entirely.
    const src =
      node.getAttribute("src") ||
      node.querySelector?.("source[src]")?.getAttribute("src");
    if (!src) {
      return "";
    }

    return `[Video](${src})`;
  },
});

/**
 * Converts HTML into markdown using the module-level Turndown converter.
 * No sanitisation happens here; the input is passed straight to the converter.
 *
 * @param {string} sanitizedHtml - HTML to convert.
 * @returns {string} The markdown produced by the converter.
 */
function toMarkdown(sanitizedHtml) {
  const markdown = turndownService.turndown(sanitizedHtml);
  return markdown;
}

module.exports = { toMarkdown };
Loading
Loading