From c50be0aa081f0e93a76dde62d693a0910162fb6a Mon Sep 17 00:00:00 2001 From: Luis Pabon Date: Wed, 17 Jun 2026 14:31:33 +0100 Subject: [PATCH 1/9] docs: add planning artifacts for markdown output feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #52 — return content as markdown by default with configurable contentFormat param and CONTENT_FORMAT env var. Co-Authored-By: Claude Opus 4.6 --- .../2026-06-17_markdown-output/overview.md | 65 +++++++ .../2026-06-17_markdown-output/plan.yaml | 166 ++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100644 .project_planning/2026-06-17_markdown-output/overview.md create mode 100644 .project_planning/2026-06-17_markdown-output/plan.yaml diff --git a/.project_planning/2026-06-17_markdown-output/overview.md b/.project_planning/2026-06-17_markdown-output/overview.md new file mode 100644 index 0000000..13a30c9 --- /dev/null +++ b/.project_planning/2026-06-17_markdown-output/overview.md @@ -0,0 +1,65 @@ +## Request + +GitHub issue #52: Return content as markdown by default instead of HTML. Make output format configurable per-request. + +## Overview + +Add HTML-to-markdown conversion to the response pipeline using Turndown + turndown-plugin-gfm. The `content` field changes from sanitized HTML to sanitized-then-converted markdown by default. Consumers control output format via a `contentFormat` request body param (`"markdown"` or `"html"`), with a server-wide default set by the `CONTENT_FORMAT` env var. + +Pipeline becomes: raw HTML → DOMPurify sanitize → Turndown convert (if markdown) → response. + +## Key Decisions + +- **Turndown over alternatives**: Battle-tested with Readability output, pure JS, lightweight (~50KB), plugin ecosystem. node-html-markdown is faster but has a smaller community; mdream needs native bindings, which are problematic on Alpine. +- **`contentFormat` not `outputFormat`**: More descriptive — only the `content` field changes format. 
+- **Always sanitize regardless of format**: Defense-in-depth. Turndown can pass unrecognized HTML through to the markdown output as raw HTML, and crafted HTML could produce markdown that re-renders maliciously. Running DOMPurify first eliminates these vectors. +- **Per-request param + env var default**: Per-request gives consumers flexibility. Env var (`CONTENT_FORMAT`) lets operators pin `html` globally for backward compatibility during migration. +- **Default to markdown**: Per issue intent. Breaking change, mitigated by env var. + +## Tradeoffs + +- **Breaking default**: Existing consumers expecting HTML in `content` will get markdown. Accepted — env var provides escape hatch, and the security posture improvement justifies the default. +- **Extra dependency (Turndown + GFM plugin)**: Adds ~60KB to node_modules. Acceptable for the functionality gained. Both are pure JS, no native modules. +- **Sanitize-then-convert vs convert-only**: Small perf cost from always sanitizing. Worth it for defense-in-depth — can't guarantee downstream consumers won't re-render markdown as HTML. 
+ +## Scope Boundaries + +**In scope:** +- `turndown` and `turndown-plugin-gfm` dependencies +- Conversion logic in app.js or a new converter module +- `contentFormat` request body parameter with validation +- `CONTENT_FORMAT` env var with config parsing and validation +- Tests: unit for conversion, integration for request param, config parsing +- README and env var documentation updates +- Docker compose example update if env vars are documented there + +**Out of scope:** +- No new response fields — `content` changes format, field set stays frozen +- No changes to error handling, status codes, or error envelope +- No changes to `textContent` (already plain text) +- No changes to other response fields (all plain strings) +- No Turndown customization beyond GFM plugin and media embed rules + +## Verification Strategy + +| Check | Command | Cost | Notes | +|-------------|----------------------|--------|------------------------------------------| +| Lint | `pnpm lint` | Cheap | Prettier check on src/ test/ scripts/ | +| Test | `pnpm test` | Cheap | `node --test`, fixture-based | +| Lint fix | `pnpm lint:fix` | Cheap | Prettier auto-fix, prefer over check | +| Memory soak | `pnpm memory:soak` | Medium | Only if allocation behavior may change | +| Docker build| `docker build -t readability-js .` | Medium | Verify container builds cleanly | + +Run `pnpm lint:fix` then `pnpm test` after each step. Memory soak and Docker build at the end. 
+ +## Decision Log + +| # | Decision | Rationale | +|---|----------|-----------| +| 1 | Turndown as converter | Proven with Readability, pure JS, Alpine-safe | +| 2 | `contentFormat` param name | Descriptive of what changes — only `content` field | +| 3 | Always sanitize first | Defense-in-depth: converter bugs, markdown injection, raw HTML passthrough | +| 4 | Env var `CONTENT_FORMAT` | Backward compat for operators, overridable per-request | +| 5 | Default `markdown` | Issue intent, security improvement, env var mitigates breakage | +| 6 | Custom Turndown rules for media embeds | Pattern-match iframe `src` against known embed domains (YouTube, Vimeo, Dailymotion, etc.) → `[Video: Provider](url)`. Unknown iframes → `[Embedded content](url)`. Video tags → `[Video](src)`. Preserves media references that would otherwise be silently dropped. | +| 7 | Invalid `contentFormat` returns 400 | Descriptive error message echoing valid options. Consistent with existing input validation pattern. | diff --git a/.project_planning/2026-06-17_markdown-output/plan.yaml b/.project_planning/2026-06-17_markdown-output/plan.yaml new file mode 100644 index 0000000..0e25bf8 --- /dev/null +++ b/.project_planning/2026-06-17_markdown-output/plan.yaml @@ -0,0 +1,166 @@ +steps: + - id: step-1 + title: Add Turndown dependencies + scope: | + Install turndown and turndown-plugin-gfm as production dependencies via pnpm. + files: + - package.json + - pnpm-lock.yaml + constraints: + - Use exact versions pinned by pnpm + - No native modules — both must be pure JS + acceptance: + - turndown and turndown-plugin-gfm appear in package.json dependencies + - pnpm install --frozen-lockfile succeeds after lockfile update + verification: + - "pnpm install --frozen-lockfile" + + - id: step-2 + title: Add CONTENT_FORMAT config option + scope: | + Add CONTENT_FORMAT env var support to config.js. Valid values: "markdown", "html". + Default: "markdown". Parsed and validated alongside existing config options. 
+ files: + - src/config.js + constraints: + - Follow existing config parsing patterns (parseBoolean, parseInteger style) + - Invalid values must throw with a descriptive message listing valid options + - Add to DEFAULTS object + acceptance: + - loadConfig() returns contentFormat field + - validateConfig() validates contentFormat + - Invalid CONTENT_FORMAT values throw descriptive error + - Default is "markdown" + verification: + - "pnpm lint:fix" + - "pnpm test" + + - id: step-3 + title: Add contentFormat request parameter with validation + scope: | + Accept optional contentFormat in POST / request body. Validate against allowed + values ("markdown", "html"). Invalid values return 400 with descriptive error + listing valid options. Falls back to server config default when omitted. + files: + - src/app.js + depends_on: + - step-2 + constraints: + - 400 error message must echo valid options + - Follow existing request validation patterns + - Do not change error envelope shape + acceptance: + - Valid contentFormat values accepted + - Missing contentFormat uses server config default + - Invalid contentFormat returns 400 with descriptive message + verification: + - "pnpm lint:fix" + - "pnpm test" + + - id: step-4 + title: Create markdown converter module + scope: | + New module src/markdown.js that initializes Turndown with GFM plugin and custom + media embed rules. Exports a function that converts sanitized HTML to markdown. + + Custom Turndown rules: + - iframe: pattern-match src against known embed domains (YouTube, Vimeo, + Dailymotion, etc.) → [Video: Provider](url). Unknown iframes → [Embedded content](url). + - video: convert to [Video](src). + + Module creates a single Turndown instance (reusable, stateless per conversion). 
+ files: + - src/markdown.js + constraints: + - Single shared Turndown instance, not per-request + - Known embed domain list should be easy to extend + - Pure function: sanitized HTML string in, markdown string out + acceptance: + - Converts standard article HTML (headings, paragraphs, lists, links, images, bold, italic, blockquotes, code) to clean markdown + - YouTube/Vimeo/Dailymotion iframes become labeled video links + - Unknown iframes become generic embedded content links + - Video tags become video links + - GFM tables convert correctly + verification: + - "pnpm lint:fix" + - "pnpm test" + + - id: step-5 + title: Wire conversion into response pipeline + scope: | + Modify POST / handler in app.js to apply markdown conversion when contentFormat + is "markdown". Pipeline: HTML → DOMPurify sanitize → convert to markdown (if needed). + When contentFormat is "html", existing behavior (sanitize only). + + Update response.js if mapArticleResponse needs awareness of content format. + files: + - src/app.js + - src/response.js + depends_on: + - step-3 + - step-4 + constraints: + - Sanitization ALWAYS runs regardless of format + - Conversion happens after sanitization + - Do not change response field set + acceptance: + - Default requests return markdown content + - contentFormat=html returns sanitized HTML (existing behavior) + - contentFormat=markdown returns sanitized-then-converted markdown + - All other response fields unchanged + verification: + - "pnpm lint:fix" + - "pnpm test" + + - id: step-6 + title: Add tests for markdown output and contentFormat param + scope: | + Add test coverage for: + - Default response returns markdown content + - contentFormat=html returns HTML content + - contentFormat=markdown explicitly returns markdown + - Invalid contentFormat returns 400 with descriptive error + - Media embed conversion (iframe → video link, video tag → link) + - GFM table conversion + - Config parsing for CONTENT_FORMAT env var + - Sanitization still runs before 
markdown conversion (script tags, event handlers stripped) + files: + - test/app.test.js + depends_on: + - step-5 + constraints: + - Follow existing test patterns (node:test, supertest, fixture servers) + - Test both formats explicitly, don't rely on default alone + - Verify sanitization + conversion interaction + acceptance: + - All new tests pass + - Existing tests updated where content assertions assumed HTML + - No test regressions + verification: + - "pnpm lint:fix" + - "pnpm test" + + - id: step-7 + title: Update documentation + scope: | + Update README.md with: + - contentFormat request parameter documentation + - CONTENT_FORMAT env var documentation + - Updated example request/response showing markdown output + - Note about backward compatibility (CONTENT_FORMAT=html) + + Update examples/compose.yaml if env vars are documented there. + files: + - README.md + - examples/compose.yaml + depends_on: + - step-6 + constraints: + - Follow existing README structure and style + - Document breaking change clearly + acceptance: + - README documents contentFormat param and CONTENT_FORMAT env var + - Compose example shows CONTENT_FORMAT option + verification: + - "pnpm lint:fix" + - "pnpm test" From fac9c855a7ba61f64be4adc8b33e6f32801f668d Mon Sep 17 00:00:00 2001 From: Luis Pabon Date: Wed, 17 Jun 2026 14:37:18 +0100 Subject: [PATCH 2/9] chore: add turndown and turndown-plugin-gfm dependencies Co-Authored-By: Claude Opus 4.6 --- package.json | 4 +++- pnpm-lock.yaml | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index ce1d043..d9eb72f 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,9 @@ "@mozilla/readability": "^0.6.0", "dompurify": "^3.4.11", "express": "^5.2.1", - "jsdom": "^29.1.1" + "jsdom": "^29.1.1", + "turndown": "^7.2.4", + "turndown-plugin-gfm": "^1.0.2" }, "scripts": { "start": "nodemon src/server.js", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 68d7207..56feb26 100644 --- 
a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -20,6 +20,12 @@ importers: jsdom: specifier: ^29.1.1 version: 29.1.1(@noble/hashes@1.8.0) + turndown: + specifier: ^7.2.4 + version: 7.2.4 + turndown-plugin-gfm: + specifier: ^1.0.2 + version: 1.0.2 devDependencies: nodemon: specifier: ^3.1.14 @@ -97,6 +103,9 @@ packages: '@noble/hashes': optional: true + '@mixmark-io/domino@2.2.0': + resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} + '@mozilla/readability@0.6.0': resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==} engines: {node: '>=14.0.0'} @@ -612,6 +621,13 @@ packages: resolution: {integrity: sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==} engines: {node: '>=20'} + turndown-plugin-gfm@1.0.2: + resolution: {integrity: sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==} + + turndown@7.2.4: + resolution: {integrity: sha512-I8yFsfRzmzK0WV1pNNOA4A7y4RDfFxPRxb3t+e3ui14qSGOxGtiSP6GjeX+Y6CHb7HYaFj7ECUD7VE5kQMZWGQ==} + engines: {node: '>=18', npm: '>=9'} + type-is@2.1.0: resolution: {integrity: sha512-faYHw0anBbc/kWF3zFTEnxSFOAGUX9GFbOBthvDdLsIlEoWOFOtS0zgCiQYwIskL9iGXZL3kAXD8OoZ4GmMATA==} engines: {node: '>= 18'} @@ -711,6 +727,8 @@ snapshots: optionalDependencies: '@noble/hashes': 1.8.0 + '@mixmark-io/domino@2.2.0': {} + '@mozilla/readability@0.6.0': {} '@noble/hashes@1.8.0': {} @@ -1277,6 +1295,12 @@ snapshots: dependencies: punycode: 2.3.1 + turndown-plugin-gfm@1.0.2: {} + + turndown@7.2.4: + dependencies: + '@mixmark-io/domino': 2.2.0 + type-is@2.1.0: dependencies: content-type: 2.0.0 From 82e771e7ce6e6728f7e2542c54127ab3ce39bdf7 Mon Sep 17 00:00:00 2001 From: Luis Pabon Date: Wed, 17 Jun 2026 14:38:59 +0100 Subject: [PATCH 3/9] feat(config): add CONTENT_FORMAT env var with markdown default - Add parseContentFormat() validator function 
supporting "markdown" and "html" - Add CONTENT_FORMAT to DEFAULTS with "markdown" as default - Add contentFormat field to config object returned by loadConfig() - Add contentFormat validation in validateConfig() - Update test expectations to include new contentFormat field Co-Authored-By: Claude Opus 4.6 --- src/config.js | 14 ++++++++++++++ test/app.test.js | 1 + 2 files changed, 15 insertions(+) diff --git a/src/config.js b/src/config.js index c3c3db6..53d761c 100644 --- a/src/config.js +++ b/src/config.js @@ -6,6 +6,7 @@ const DEFAULTS = Object.freeze({ FETCH_MAX_REDIRECTS: 5, BLOCK_PRIVATE_NETWORKS: true, MAX_CONCURRENT_REQUESTS: 10, + CONTENT_FORMAT: "markdown", }); const BODY_LIMIT_PATTERN = /^\d+(b|kb|mb|gb)?$/i; @@ -73,6 +74,17 @@ function parseBodyLimit(rawValue) { return rawValue.trim().toLowerCase(); } +function parseContentFormat(name, rawValue) { + const validFormats = ["markdown", "html"]; + const normalized = String(rawValue).trim().toLowerCase(); + + if (!validFormats.includes(normalized)) { + throw new Error(`${name} must be one of: ${validFormats.join(", ")}`); + } + + return normalized; +} + function validateConfig(configInput) { const config = configInput || {}; @@ -100,6 +112,7 @@ function validateConfig(configInput) { "MAX_CONCURRENT_REQUESTS", config.maxConcurrentRequests, ), + contentFormat: parseContentFormat("CONTENT_FORMAT", config.contentFormat), }; } @@ -118,6 +131,7 @@ function loadConfig(env = process.env) { : env.READABILITY_MAX_ELEMS, maxConcurrentRequests: env.MAX_CONCURRENT_REQUESTS ?? DEFAULTS.MAX_CONCURRENT_REQUESTS, + contentFormat: env.CONTENT_FORMAT ?? 
DEFAULTS.CONTENT_FORMAT, }); } diff --git a/test/app.test.js b/test/app.test.js index c149172..2f4b5ad 100644 --- a/test/app.test.js +++ b/test/app.test.js @@ -148,6 +148,7 @@ test("configuration defaults are loaded and validated", () => { blockPrivateNetworks: DEFAULTS.BLOCK_PRIVATE_NETWORKS, readabilityMaxElems: undefined, maxConcurrentRequests: DEFAULTS.MAX_CONCURRENT_REQUESTS, + contentFormat: DEFAULTS.CONTENT_FORMAT, }); assert.throws( From f17f1c2dce946ff84cf195ea53fd5283594a9bfa Mon Sep 17 00:00:00 2001 From: Luis Pabon Date: Wed, 17 Jun 2026 14:40:30 +0100 Subject: [PATCH 4/9] feat(app): accept and validate contentFormat request param Add optional contentFormat field to POST / request body validation. Accepts "markdown" or "html" values, falls back to server config default when omitted. Returns HTTP 400 with descriptive error message listing valid options for invalid values. Co-Authored-By: Claude Opus 4.6 --- src/app.js | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/app.js b/src/app.js index 1ae1d7d..e17dcf8 100644 --- a/src/app.js +++ b/src/app.js @@ -82,6 +82,21 @@ function createConcurrencyGate(maxConcurrentRequests) { }; } +function validateRequestContentFormat(rawValue, defaultFormat) { + if (rawValue === undefined) { + return defaultFormat; + } + + const validFormats = ["markdown", "html"]; + const normalized = String(rawValue).trim().toLowerCase(); + + if (!validFormats.includes(normalized)) { + throw new Error(`contentFormat must be one of: ${validFormats.join(", ")}`); + } + + return normalized; +} + function createReadabilityOptions(config) { if (config.readabilityMaxElems === undefined) { return undefined; @@ -439,6 +454,19 @@ function createApp(configInput, loggerInput) { return; } + let contentFormat; + try { + contentFormat = validateRequestContentFormat( + req.body?.contentFormat, + config.contentFormat, + ); + } catch (error) { + res.status(400).send({ + error: error.message, + }); + return; + } + 
logger.info(`Fetching ${url}...`); try { From 49b8ea7789b1aeedba83ad64ef46d69c06429d78 Mon Sep 17 00:00:00 2001 From: Luis Pabon Date: Wed, 17 Jun 2026 14:42:15 +0100 Subject: [PATCH 5/9] feat(markdown): add HTML-to-markdown converter module Co-Authored-By: Claude Sonnet 4.6 --- src/markdown.js | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 src/markdown.js diff --git a/src/markdown.js b/src/markdown.js new file mode 100644 index 0000000..0eea134 --- /dev/null +++ b/src/markdown.js @@ -0,0 +1,48 @@ +"use strict"; + +const TurndownService = require("turndown"); +const { gfm } = require("turndown-plugin-gfm"); + +const KNOWN_EMBED_DOMAINS = [ + { pattern: /youtube\.com|youtu\.be/, label: "YouTube" }, + { pattern: /vimeo\.com/, label: "Vimeo" }, + { pattern: /dailymotion\.com/, label: "Dailymotion" }, +]; + +const turndownService = new TurndownService(); +turndownService.use(gfm); + +turndownService.addRule("iframe", { + filter: "iframe", + replacement(content, node) { + const src = node.getAttribute("src"); + if (!src) { + return ""; + } + + const match = KNOWN_EMBED_DOMAINS.find(({ pattern }) => pattern.test(src)); + if (match) { + return `[Video: ${match.label}](${src})`; + } + + return `[Embedded content](${src})`; + }, +}); + +turndownService.addRule("video", { + filter: "video", + replacement(content, node) { + const src = node.getAttribute("src"); + if (!src) { + return ""; + } + + return `[Video](${src})`; + }, +}); + +function toMarkdown(sanitizedHtml) { + return turndownService.turndown(sanitizedHtml); +} + +module.exports = { toMarkdown }; From 836dcef024774d5ccf5cc3ccf27392f124816b41 Mon Sep 17 00:00:00 2001 From: Luis Pabon Date: Wed, 17 Jun 2026 14:44:31 +0100 Subject: [PATCH 6/9] feat(app): wire markdown conversion into response pipeline Import toMarkdown from markdown.js and apply it after DOMPurify sanitization when contentFormat is "markdown". 
Update existing sanitization and media-tag test assertions to expect markdown output since the default contentFormat is now "markdown". Co-Authored-By: Claude Sonnet 4.6 --- src/app.js | 8 +++++++- test/app.test.js | 7 +++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/app.js b/src/app.js index e17dcf8..cbc6c9f 100644 --- a/src/app.js +++ b/src/app.js @@ -7,6 +7,7 @@ const createDOMPurify = require("dompurify"); const { loadConfig, validateConfig } = require("./config"); const { createLogger } = require("./logger"); +const { toMarkdown } = require("./markdown"); const { mapArticleResponse } = require("./response"); const DOMPurify = createDOMPurify(new JSDOM("").window); @@ -478,10 +479,15 @@ function createApp(configInput, loggerInput) { dom.window.document, createReadabilityOptions(config), ).parse(); + const sanitizedContent = sanitizeArticleContent(parsed?.content ?? null); + const finalContent = + sanitizedContent !== null && contentFormat === "markdown" + ? toMarkdown(sanitizedContent) + : sanitizedContent; const article = parsed ? { ...parsed, - content: sanitizeArticleContent(parsed.content), + content: finalContent, } : null; diff --git a/test/app.test.js b/test/app.test.js index 2f4b5ad..696c885 100644 --- a/test/app.test.js +++ b/test/app.test.js @@ -303,9 +303,8 @@ test("POST / sanitizes returned article content", async (t) => { assert.doesNotMatch(response.body.content, /javascript:/i); assert.match( response.body.content, - /]*src="https:\/\/cdn\.example\/image\.jpg"/, + /!\[\]\(https:\/\/cdn\.example\/image\.jpg\)/, ); - assert.match(response.body.content, /