diff --git a/README.md b/README.md index e2d78e4..d9280ee 100644 --- a/README.md +++ b/README.md @@ -31,12 +31,15 @@ Body: ```json { - "url": "https://example.com/article" + "url": "https://example.com/article", + "contentFormat": "markdown" } ``` The `url` field is required. Only absolute `http:` and `https:` URLs are accepted. +The `contentFormat` field is optional and controls the format of the `content` response field. Valid values are `"markdown"` (default) or `"html"`. This field overrides the server-wide `CONTENT_FORMAT` environment variable on a per-request basis. + ### Success response HTTP 200 returns the requested URL plus the parsed article fields: @@ -47,7 +50,7 @@ HTTP 200 returns the requested URL plus the parsed article fields: "title": "Article title", "byline": "Author name", "dir": "ltr", - "content": "
...
", + "content": "# Article title\n\nAuthor name\n\n...", "length": 12345, "excerpt": "Short summary", "siteName": "Site name", @@ -57,6 +60,8 @@ HTTP 200 returns the requested URL plus the parsed article fields: } ``` +The `content` field is formatted as markdown by default. To receive HTML instead, set `contentFormat: "html"` in the request body or the `CONTENT_FORMAT` environment variable to `"html"`. + Fields are emitted in the exact response shape defined by the service. Nullable fields may come back as `null`. ### Error response @@ -93,6 +98,7 @@ All configuration is driven by environment variables. | `BLOCK_PRIVATE_NETWORKS` | `true` | Block loopback and private-network targets by default. | | `READABILITY_MAX_ELEMS` | unset | Optional Readability parse cap for very large documents. | | `MAX_CONCURRENT_REQUESTS` | `10` | Maximum in-flight requests per process before returning `429`. | +| `CONTENT_FORMAT` | `"markdown"` | Default content format for the `content` response field. Valid values: `"markdown"` or `"html"`. Can be overridden per-request via the `contentFormat` parameter. | Example: @@ -182,6 +188,15 @@ This service is still an untrusted content fetcher. Do not relax the defaults wi - Per-process concurrency is capped by `MAX_CONCURRENT_REQUESTS` - The response shape is fixed; do not add fields casually +## Breaking change: contentFormat default + +The `content` response field is now returned as **markdown by default** instead of HTML. Existing consumers that expect HTML must either: + +1. Set the `CONTENT_FORMAT=html` environment variable (server-wide default), or +2. Pass `contentFormat: "html"` in each request + +This change makes article content more portable and easier to consume, but requires explicit opt-in to preserve the previous HTML output. + ## Memory behavior The service does not keep article state between requests, but each fetch still allocates DOM and Readability objects while it parses. Short memory soaks show growth in `rss` and `heapUsed` during active work, while `external` stays comparatively flat. That is the signal to watch for leak regressions: sustained growth across longer runs, not a single small sample. diff --git a/package.json b/package.json index ce1d043..d9eb72f 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,9 @@ "@mozilla/readability": "^0.6.0", "dompurify": "^3.4.11", "express": "^5.2.1", - "jsdom": "^29.1.1" + "jsdom": "^29.1.1", + "turndown": "^7.2.4", + "turndown-plugin-gfm": "^1.0.2" }, "scripts": { "start": "nodemon src/server.js", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 68d7207..56feb26 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -20,6 +20,12 @@ importers: jsdom: specifier: ^29.1.1 version: 29.1.1(@noble/hashes@1.8.0) + turndown: + specifier: ^7.2.4 + version: 7.2.4 + turndown-plugin-gfm: + specifier: ^1.0.2 + version: 1.0.2 devDependencies: nodemon: specifier: ^3.1.14 @@ -97,6 +103,9 @@ packages: '@noble/hashes': optional: true + '@mixmark-io/domino@2.2.0': + resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} + '@mozilla/readability@0.6.0': resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==} engines: {node: '>=14.0.0'} @@ -612,6 +621,13 @@ packages: resolution: {integrity: sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==} engines: {node: '>=20'} + turndown-plugin-gfm@1.0.2: + resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==} + + turndown@7.2.4: + resolution: {integrity: sha512-I8yFsfRzmzK0WV1pNNOA4A7y4RDfFxPRxb3t+e3ui14qSGOxGtiSP6GjeX+Y6CHb7HYaFj7ECUD7VE5kQMZWGQ==} + engines: {node: '>=18', npm: '>=9'} + type-is@2.1.0: resolution: {integrity: sha512-faYHw0anBbc/kWF3zFTEnxSFOAGUX9GFbOBthvDdLsIlEoWOFOtS0zgCiQYwIskL9iGXZL3kAXD8OoZ4GmMATA==} engines: {node: '>= 18'} @@ -711,6 +727,8 @@ snapshots: optionalDependencies: '@noble/hashes': 1.8.0 + '@mixmark-io/domino@2.2.0': {} + '@mozilla/readability@0.6.0': {} '@noble/hashes@1.8.0': {} @@ -1277,6 +1295,12 @@ snapshots: dependencies: punycode: 2.3.1 + turndown-plugin-gfm@1.0.2: {} + + turndown@7.2.4: + dependencies: + '@mixmark-io/domino': 2.2.0 + type-is@2.1.0: dependencies: content-type: 2.0.0 diff --git a/src/app.js b/src/app.js index 1ae1d7d..cbc6c9f 100644 --- a/src/app.js +++ b/src/app.js @@ -7,6 +7,7 @@ const createDOMPurify = require("dompurify"); const { loadConfig, validateConfig } = require("./config"); const { createLogger } = require("./logger"); +const { toMarkdown } = require("./markdown"); const { mapArticleResponse } = require("./response"); const DOMPurify = createDOMPurify(new JSDOM("").window); @@ -82,6 +83,21 @@ function createConcurrencyGate(maxConcurrentRequests) { }; } +function validateRequestContentFormat(rawValue, defaultFormat) { + if (rawValue === undefined) { + return defaultFormat; + } + + const validFormats = ["markdown", "html"]; + const normalized = String(rawValue).trim().toLowerCase(); + + if (!validFormats.includes(normalized)) { + throw new Error(`contentFormat must be one of: ${validFormats.join(", ")}`); + } + + return normalized; +} + function createReadabilityOptions(config) { if (config.readabilityMaxElems === undefined) { return undefined; @@ -439,6 +455,19 @@ function createApp(configInput, loggerInput) { return; } + let contentFormat; + try { + contentFormat = validateRequestContentFormat( + req.body?.contentFormat, + config.contentFormat, + ); + } catch (error) { + res.status(400).send({ + error: error.message, + }); + return; + } + logger.info(`Fetching ${url}...`); try { @@ -450,10 +479,15 @@ function createApp(configInput, loggerInput) { dom.window.document, createReadabilityOptions(config), ).parse(); + const sanitizedContent = sanitizeArticleContent(parsed?.content ?? null); + const finalContent = + sanitizedContent !== null && contentFormat === "markdown" + ? toMarkdown(sanitizedContent) + : sanitizedContent; const article = parsed ? { ...parsed, - content: sanitizeArticleContent(parsed.content), + content: finalContent, } : null; diff --git a/src/config.js b/src/config.js index c3c3db6..53d761c 100644 --- a/src/config.js +++ b/src/config.js @@ -6,6 +6,7 @@ const DEFAULTS = Object.freeze({ FETCH_MAX_REDIRECTS: 5, BLOCK_PRIVATE_NETWORKS: true, MAX_CONCURRENT_REQUESTS: 10, + CONTENT_FORMAT: "markdown", }); const BODY_LIMIT_PATTERN = /^\d+(b|kb|mb|gb)?$/i; @@ -73,6 +74,17 @@ function parseBodyLimit(rawValue) { return rawValue.trim().toLowerCase(); } +function parseContentFormat(name, rawValue) { + const validFormats = ["markdown", "html"]; + const normalized = String(rawValue).trim().toLowerCase(); + + if (!validFormats.includes(normalized)) { + throw new Error(`${name} must be one of: ${validFormats.join(", ")}`); + } + + return normalized; +} + function validateConfig(configInput) { const config = configInput || {}; @@ -100,6 +112,7 @@ function validateConfig(configInput) { "MAX_CONCURRENT_REQUESTS", config.maxConcurrentRequests, ), + contentFormat: parseContentFormat("CONTENT_FORMAT", config.contentFormat), }; } @@ -118,6 +131,7 @@ function loadConfig(env = process.env) { : env.READABILITY_MAX_ELEMS, maxConcurrentRequests: env.MAX_CONCURRENT_REQUESTS ?? DEFAULTS.MAX_CONCURRENT_REQUESTS, + contentFormat: env.CONTENT_FORMAT ?? DEFAULTS.CONTENT_FORMAT, }); } diff --git a/src/markdown.js b/src/markdown.js new file mode 100644 index 0000000..0eea134 --- /dev/null +++ b/src/markdown.js @@ -0,0 +1,48 @@ +"use strict"; + +const TurndownService = require("turndown"); +const { gfm } = require("turndown-plugin-gfm"); + +const KNOWN_EMBED_DOMAINS = [ + { pattern: /youtube\.com|youtu\.be/, label: "YouTube" }, + { pattern: /vimeo\.com/, label: "Vimeo" }, + { pattern: /dailymotion\.com/, label: "Dailymotion" }, +]; + +const turndownService = new TurndownService(); +turndownService.use(gfm); + +turndownService.addRule("iframe", { + filter: "iframe", + replacement(content, node) { + const src = node.getAttribute("src"); + if (!src) { + return ""; + } + + const match = KNOWN_EMBED_DOMAINS.find(({ pattern }) => pattern.test(src)); + if (match) { + return `[Video: ${match.label}](${src})`; + } + + return `[Embedded content](${src})`; + }, +}); + +turndownService.addRule("video", { + filter: "video", + replacement(content, node) { + const src = node.getAttribute("src"); + if (!src) { + return ""; + } + + return `[Video](${src})`; + }, +}); + +function toMarkdown(sanitizedHtml) { + return turndownService.turndown(sanitizedHtml); +} + +module.exports = { toMarkdown }; diff --git a/test/app.test.js b/test/app.test.js index c149172..ddd44d2 100644 --- a/test/app.test.js +++ b/test/app.test.js @@ -7,6 +7,7 @@ const supertest = require("supertest"); const { DEFAULTS, loadConfig } = require("../src/config"); const { RESPONSE_FIELDS } = require("../src/response"); const { createApp, createReadabilityOptions, messages } = require("../src/app"); +const { toMarkdown } = require("../src/markdown"); function createTestApp(configOverrides) { return createApp( @@ -148,6 +149,7 @@ test("configuration defaults are loaded and validated", () => { blockPrivateNetworks: DEFAULTS.BLOCK_PRIVATE_NETWORKS, readabilityMaxElems: undefined, maxConcurrentRequests: DEFAULTS.MAX_CONCURRENT_REQUESTS, + contentFormat: DEFAULTS.CONTENT_FORMAT, }); assert.throws( @@ -302,9 +304,8 @@ test("POST / sanitizes returned article content", async (t) => { assert.doesNotMatch(response.body.content, /javascript:/i); assert.match( response.body.content, - /]*src="https:\/\/cdn\.example\/image\.jpg"/, + /!\[\]\(https:\/\/cdn\.example\/image\.jpg\)/, ); - assert.match(response.body.content, /