diff --git a/Cargo.lock b/Cargo.lock index ab20718..74f50ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2420,6 +2420,7 @@ dependencies = [ "base64", "clap", "dialoguer", + "flate2", "fs2", "hex", "indicatif", @@ -2431,6 +2432,7 @@ dependencies = [ "serial_test", "sha2", "socket-patch-core", + "tar", "tempfile", "testcontainers", "tokio", diff --git a/README.md b/README.md index 72809b5..19fec6a 100644 --- a/README.md +++ b/README.md @@ -233,7 +233,10 @@ socket-patch scan -g # Scan + apply + emit an OpenVEX attestation in one pass socket-patch scan --json --sync --yes --vex socket.vex.json -# Vendor every patched dependency (committable; see the vendor command) +# Vendor every patched dependency (committable; see the vendor command). +# Works on a completely fresh clone: dependencies listed in the lockfile +# but not yet installed are fetched pristine from their registry and +# integrity-verified against the lockfile before vendoring. socket-patch scan --json --vendor --yes # Same, but keep the manifest out of it entirely diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md index 7fb1d59..9b02e11 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -15,7 +15,7 @@ This document defines the **public surface** of the `socket-patch` binary. 
Anyth | `list` | — | Print patches in the local manifest | | `remove` | — | Remove patch from manifest (rolls back first); requires positional `identifier` | | `setup` | — | Wire automatic-patching install hooks (npm/pypi/gem) | -| `repair` | `gc` | Download missing blobs + clean up unused ones | +| `repair` | `gc` | Download missing blobs, rebuild missing/corrupt vendored artifacts, clean up unused ones | | `vendor` | — | Eject patched dependencies into committable `.socket/vendor/` and rewire lockfiles | | `vex` | — | Emit an OpenVEX 0.2.0 attestation derived from the local manifest | @@ -55,7 +55,8 @@ Beyond the globals above, each subcommand defines a small set of local arguments | Subcommand | Local arg | Env var | Purpose | |---|---|---|---| | `apply` | `--force` / `-f` | `SOCKET_FORCE` | Bypass beforeHash check | -| `vendor` | `--force` / `-f` | `SOCKET_FORCE` | Bypass beforeHash check when staging the vendored copy | +| `vendor` | `--force` / `-f` | `SOCKET_FORCE` | Tolerate missing patch-target files in the stage + bypass the variant probe. A beforeHash mismatch no longer needs it: vendor staging auto-overwrites with the verified patched content (`vendor_content_mismatch_overwritten` warning) | +| (global) | `--strict` | `SOCKET_STRICT` | Treat a beforeHash mismatch as a hard error in the in-place apply paths (apply/get/scan --apply/hook/go redirect). DEFAULT (v3.4): a mismatched file is overwritten with the FULL verified patched content (the diff strategy self-disables on a wrong base; archive/blob writes are hash-gated to exactly afterHash; the missing blob is downloaded on demand) and surfaced as a `content_mismatch_overwritten` stderr warning + Skipped event. `--force` overrides `--strict` and additionally skips missing files. Vendor staging is unaffected (it always auto-overwrites into its private stage). 
| | `vendor` | `--revert` | `SOCKET_VENDOR_REVERT` | Undo vendoring: restore recorded original lockfile fragments + remove `.socket/vendor/` artifacts. Works without a manifest | | `apply`, `scan`, `vendor` | `--vex` | `SOCKET_VEX` | Generate an OpenVEX 0.2.0 document at this path on a successful run; see "embedded VEX" below | | `apply`, `scan`, `vendor` | `--vex-product`, `--vex-no-verify`, `--vex-doc-id`, `--vex-compact` | `SOCKET_VEX_PRODUCT`, `SOCKET_VEX_NO_VERIFY`, `SOCKET_VEX_DOC_ID`, `SOCKET_VEX_COMPACT` | Passthrough to the embedded VEX builder; mirror the standalone `vex` knobs. Inert unless `--vex` is set | @@ -71,13 +72,17 @@ Beyond the globals above, each subcommand defines a small set of local arguments `scan --apply` opts JSON callers into the full discover → select → apply pipeline. Without it, `scan --json` stays read-only (discovery + `updates` array only). No effect outside `--json` mode — the non-JSON path always prompts the user interactively. -`scan --prune` opts into garbage collection. When set, `scan` removes manifest entries for packages no longer present in the crawl, then deletes orphan blob, diff, and package-archive files from `.socket/`. Off by default (v3.0) so a temporary uninstall doesn't silently destroy manifest state. +`scan --prune` opts into garbage collection. When set, `scan` removes manifest entries for packages no longer present in the crawl, then deletes orphan blob, diff, and package-archive files from `.socket/`. Off by default (v3.0) so a temporary uninstall doesn't silently destroy manifest state. 
The pass also reconciles vendored state (runs FIRST, under the apply lock — lock contention skips it without failing the scan): vendored entries whose patch is gone from the manifest are reverted, vendored entries whose dependency is no longer in the lockfile graph are reverted AND their manifest entries dropped (detached entries are exempt from both — they are manifest- and lockfile-invisible by design; a missing or undeterminable lockfile keeps the entry, fail-safe), and orphan `.socket/vendor//` dirs with no ledger entry are swept. The JSON `gc` sub-object gains `revertedVendoredEntries` + `removedVendorOrphanDirs` (wet) / `revertableVendoredEntries` + `vendorOrphanDirs` (preview). `scan` queries the patch API in `--batch-size` chunks. Authenticated runs POST `/v0/orgs/{slug}/patches/batch`; token-less runs POST `{proxy}/patch/batch` on the public proxy and degrade to per-package `GET /patch/by-package/:purl` requests in two cases: the deployed proxy predates the batch endpoint (legacy proxies answer the POST with their `400 "Unsupported endpoint"` catch-all), or the all-or-nothing batch validation rejects the chunk (e.g. a crawled PURL type the server doesn't recognize, such as `pkg:jsr/…` — the per-package path tolerates those individually, preserving the pre-batch scan semantics). Rate limits and over-capacity 503s surface instead of silently degrading. +**Lockfile supplement (v3.4)**: `scan` discovery is no longer limited to installed trees. The project's lockfiles (`package-lock.json`/`npm-shrinkwrap.json`, `pnpm-lock.yaml` v9, `yarn.lock` classic + berry, `bun.lock`, `Cargo.lock`, `go.sum`, `composer.lock`, `Gemfile.lock`, `uv.lock`/`poetry.lock`/pinned `requirements.txt`) are inventoried and dependencies with NO installed copy join discovery — counts, the API lookup, the table (flagged ` [NOT INSTALLED]`, plus a stderr note), and the prune "scanned" set (a wiped node_modules no longer prunes lockfile-listed entries). 
JSON gains a top-level `lockfileOnlyPackages` count and an additive `notInstalled: true` on matching `packages[]` entries. `--apply` partitions lockfile-only patches out BEFORE download (calm `skipped`/`package_not_installed` records — never an error exit, never a manifest write); `--vendor` passes them through to the vendor engine's auto-fetch. Vendored-ledger entries likewise stay discoverable on a fresh clone (the committed artifact is the dependency). Global scans (`--global`) get no supplement. + +**Vendor auto-fetch (v3.4)**: `vendor`/`scan --vendor` no longer fail on lockfile-resolved packages with no installed copy. Already-vendored purls stage from their committed artifact (sha256-verified against the vendor ledger; offline-safe). Otherwise the pristine artifact is fetched per the lockfile resolution and verified against the lock's recorded integrity FAIL-CLOSED before any write: npm SRI (or yarn classic's sha1 fragment), yarn berry's cache-zip checksum (rebuilt from the fetched tarball; cacheKey 10c0 only), Cargo.lock sha256 over the .crate, go.sum `h1:` dirhash over the module zip, composer `dist.shasum` (sha1), Gemfile.lock `CHECKSUMS` sha256, uv.lock wheel sha256 (pure `py3-none-any` wheels only). Entries the lock cannot verify are NEVER fetched (`vendor_fetch_unverifiable` warning + the calm `package_not_installed` skip). Registry bases honor `SOCKET_NPM_REGISTRY`, `SOCKET_CRATES_REGISTRY`, `SOCKET_GOPROXY` (else `GOPROXY`); npm/yarn/composer/gem/uv lock-recorded URLs are used verbatim. `--offline` refuses the fetch with the calm skip (the detail names the lockfile resolution). The fetch stages into a private tempdir — the project tree is never touched. + `scan --sync` is sugar for `--apply --prune` — the canonical single-flag bot invocation. `scan --json --sync --yes` discovers, applies, and reconciles state in one pass. 
-`scan --vendor` swaps the in-place apply for the vendor pipeline: discover → download (manifest written, as `--apply`) → vendor every patched dependency via the same engine as the `vendor` command (under the same lock). The whole manifest is vendored, so a package vendored at an older patch uuid is **re-vendored automatically** (its old uuid dir is removed — `vendor_stale_artifact_removed`); same-uuid re-runs are `already_vendored` skips. With `--prune`, GC runs **before** the vendor step so stale manifest entries don't fail vendoring with `package_not_installed`. JSON output gains a `download` sub-object (the download phase; no `applied` field — nothing is applied in place) and a `vendor` sub-object (a full vendor Envelope). `--dry-run` previews per-patch `would_vendor` | `would_revendor` (+`oldUuid`) | `already_vendored` without network downloads or disk writes. Interactive mode prompts "Download and vendor N patch(es)?". +`scan --vendor` swaps the in-place apply for the vendor pipeline: discover → download (manifest written, as `--apply`) → vendor every patched dependency via the same engine as the `vendor` command (under the same lock). The whole manifest is vendored, so a package vendored at an older patch uuid is **re-vendored automatically** (its old uuid dir is removed — `vendor_stale_artifact_removed`); same-uuid re-runs are `already_vendored` skips. With `--prune`, GC runs **before** the vendor step so stale manifest entries don't fail vendoring with `package_not_installed`. JSON output gains a `download` sub-object (the download phase; no `applied` field — nothing is applied in place) and a `vendor` sub-object (a full vendor Envelope). The download phase writes only `.socket/manifest.json`; patch blobs are held in memory (see "Patch sources stay in memory" under the vendor contract). `--dry-run` previews per-patch `would_vendor` | `would_revendor` (+`oldUuid`) | `already_vendored` without network downloads or disk writes. 
Interactive mode prompts "Download and vendor N patch(es)?". `scan --vendor --detached` performs the same vendoring **without ever writing `.socket/manifest.json`**: records are fetched into memory (`download.detached: true`), the artifacts are built + wired, and the ledger entry carries `detached: true` plus an embedded copy of the patch record (`record`) as the verification source. Detached patches are invisible to apply/rollback/repair (nothing is in the manifest), exempt from `vendor`'s manifest reconcile, and exit via `remove <identifier>` (which reverts them) or `vendor --revert`. Idempotent re-runs reuse the embedded record and skip the patch-view fetch entirely. @@ -321,6 +326,45 @@ machines with **no socket-patch installed and no Socket API access** (registry a unvendored dependencies may still be needed). Every mechanism below was validated against the real package managers (`spikes/PHASE0-FINDINGS.txt`). +**Patch sources stay in memory (v3.4)**: vendoring never writes `.socket/blobs/`, `.socket/diffs/`, +or temporary patch files. Pre-existing `.socket/` artifacts (from a prior `apply`/`get`/`repair`) +are read in place; already-vendored purls re-stage patch content from the committed artifact itself +(uuid-matched against the ledger, every harvested blob self-verified by its afterHash — so in-sync +re-runs and fresh clones of vendored projects need no network); anything still missing is fetched +into memory via the patch-view endpoint. A vendored project's `.socket/` holds only +`manifest.json` (omitted in detached mode) and `vendor/`. + +**Vendored artifact repair (v3.5)**: `repair` health-checks every ledger entry — per-file +afterHashes inside the artifact plus, for file-shaped artifacts (`.tgz`/`.whl`), the whole file +against the ledger's recorded sha256 (the rewired lock integrity references those exact bytes) — +and REBUILDS missing/corrupt artifacts through the normal vendor backends.
The wired hot paths +rebuild the artifact only: lockfiles stay byte-identical and the ledger entry is not re-recorded +(the first run's entry holds the only pre-vendor originals). Pristine sources follow the same +ladder as vendor: the installed copy first (works under `--offline`), then a lockfile-verified +registry fetch, then the pre-vendor registry fragment recovered from the ledger's wiring +`original`s (`recover_lock_entry`) — always integrity-verified fail-closed, and the rebuilt +artifact is re-verified against the recorded fingerprint before the run counts it (`rebuilt` +event; a mismatch removes the artifact and fails with `vendor_artifact_rebuild_failed`). +Lockfile references to `.socket/vendor///...` with NO ledger coverage (the ledger was +deleted wholesale) are RECONSTRUCTED: the uuid comes from the path (the recovery rule above), the +record from the manifest — or the patch API, yielding a *detached* entry with the record embedded +— and a fresh ledger entry is persisted with the rebuilt artifact's fingerprint. When nothing is +installed and the ledger is gone, npm-family reconstruction has one more rung: the REWIRED +lockfile still records the integrity of the packed vendored tarball, so the pristine copy is +fetched (unverified, conventional registry URL, `SOCKET_NPM_REGISTRY` honored) and the +deterministically REBUILT artifact must reproduce that wired integrity — a tampered pristine +source changes the rebuilt bytes and fails closed (`vendor_artifact_rebuild_failed`, nothing +kept). Reconstructed entries carry no pre-vendor wiring originals, so a later `--revert` degrades +to the documented `vendor_lock_entry_drifted` guidance (re-resolve with the package manager). Because of this +phase, `repair` no longer errors with `manifest_not_found` when the project has a vendor ledger +or vendor-path lockfile references — it runs the vendored phase alone. 
Step 1's source download +likewise skips vendored-in-sync manifest entries (their content lives in the committed artifact), +so repairing a vendored project never re-litters `.socket/blobs`. `--dry-run` previews +(`details.wouldRebuild`); `--offline` rebuilds only from fully local sources and fails per-entry +otherwise; `vendor`/`scan --vendor` re-runs get the same rebuild for wired-but-broken artifacts +(`vendor_artifact_rebuilt` warning) and recover registry resolutions for missing committed +artifacts instead of failing. + ### Path convention + patch-UUID recovery (stable) ```text @@ -442,7 +486,8 @@ worse, lets a warm cache silently serve unpatched bytes): moved past the vendored uuid (that would break VEX verification with `vendor_uuid_mismatch` until a vendor run). The skip rides `apply.patches[]` as `skipped`/`vendored`; a newer available patch still surfaces in `updates[]` — the signal to run `scan --vendor`. `scan --prune` exempts - vendored purls (an absent installed copy is their NORMAL state, not grounds to prune). An + vendored purls from the crawl-based manifest prune (an absent installed copy is their NORMAL + state) but reconciles vendored state via the lockfile instead — see the `--prune` section. An explicit `get` is allowed to move the manifest past the vendored uuid and warns (`warnings[]` + stderr) that a `vendor` run must refresh the artifact. * **Old-binary skew caveat**: a pre-detached `socket-patch` binary running `vendor` against a @@ -576,6 +621,7 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `failed` | every command | A specific patch attempt failed. `errorCode` + `error` set. | | `removed` | `gc`/`repair`, `remove`, `rollback` | Data was removed from `.socket/` (or files rolled back). `bytes` optional. | | `verified` | `apply --dry-run`, `scan --dry-run` | The patch *would* apply cleanly. `files` lists previewed changes. 
| +| `rebuilt` | `repair` | A missing/corrupt vendored artifact was rebuilt in place (or its lost ledger entry restored — `details.ledgerRestored`). `summary.rebuilt` counts these (the field is omitted while zero). | ### Stable `errorCode` tags @@ -602,6 +648,17 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `vendor_yarn_berry_cache_unsupported` | `failed` | vendor (yarn berry): lock `cacheKey ≠ 10c0` or non-default `.yarnrc.yml` `compressionLevel` — the cache-zip checksum is not reproducible. | | `vendor_override_conflict` | `failed` | vendor (pnpm/yarn-berry): a user-authored override/resolution for the package already exists. | | `vendor_integrity_unverified` | `skipped` (warning) | vendor (pipenv): the lockfile format does not hash-check file entries; the committed wheel bytes are the protection. | +| `vendor_content_mismatch_overwritten` | `skipped` (warning) | vendor: a staged file matched NEITHER beforeHash nor afterHash (patch built against different bytes, or local edits); the stage was overwritten with the verified patched content and the vendor succeeded. | +| `vendor_fetched_missing` | `skipped` (warning) | vendor: the package was not installed; its pristine artifact was fetched per the lockfile resolution (or staged from the committed vendor artifact), integrity-verified, and vendored — the project tree was not touched. | +| `vendor_fetch_failed` | `failed` | vendor: the lockfile-resolved fetch was attempted and failed (HTTP error, size cap, integrity mismatch, or a PRESENT-but-corrupt committed artifact — pointed at `socket-patch repair`). A MISSING committed artifact no longer lands here: it falls through to the ledger-recovered registry fetch. Suppresses the duplicate `package_not_installed` skip. | +| `vendor_fetch_unverifiable` | `skipped` (warning) | vendor: the lockfile records no usable integrity for the missing package; nothing was fetched (fail-closed) and the `package_not_installed` skip follows. 
| +| `vendor_artifact_missing` | `skipped` (warning) / `failed` | vendor: the committed artifact is gone — the registry resolution is recovered from the ledger and the artifact rebuilt (warning); repair `--offline` with no local source surfaces it as the per-entry failure instead. | +| `vendor_artifact_corrupt` | `failed` | repair `--offline`: the committed artifact fails verification (member afterHashes or the ledger's whole-file sha256) and no local source can rebuild it. Online repairs rebuild instead. | +| `vendor_artifact_rebuilt` | `skipped` (warning) | vendor / scan `--vendor`: a wired-but-missing/stale artifact was rebuilt in place; lockfiles and the ledger entry untouched. (Under `repair` the `rebuilt` event carries this signal.) | +| `vendor_artifact_rebuild_failed` | `failed` | repair: the rebuild ran but the result failed verification against the recorded fingerprint (e.g. an edited state.json sha); the unverifiable artifact was removed. | +| `vendor_artifact_unrepairable` | `failed` | repair: no verifiable pristine source exists (not installed + lockfile rewired + no recoverable ledger fragment), the wheel is platform-locked with no installed copy, or the ledger entry itself cannot be trusted. | +| `vendor_uuid_mismatch` | `skipped` | repair: the manifest's patch uuid moved past the vendored artifact — a re-vendor (`vendor` / `scan --vendor`) is pending; repair does not cross patch generations. | +| `content_mismatch_overwritten` | `skipped` (warning) | apply (default policy): a file matched NEITHER beforeHash nor afterHash and was overwritten with the full verified patched content. `--strict` turns this case into a `failed` event instead. | | `vendor_lock_checksums_unsupported` / `vendor_stale_lock_checksum` | `failed` | vendor (gem): an ambiguous/platform CHECKSUMS entry, or a v1-wired lock whose stale token blocks the hot path (run `vendor --revert` + re-vendor). 
| | `pypi_{poetry,pdm,pipenv}_no_lockfile` | `failed` | vendor (pypi): a lock-less tool marker with no `requirements.txt` fallback — run `<tool> lock`. | | `vendor_*` / `pypi_*` / `gemfile_*` / `lock_*` / `locked_version_mismatch` / `user_authored_*` / `native_extensions_unsupported` / `platform_gem_unsupported` | `failed`/`skipped` | vendor: per-ecosystem refusal + drift vocabulary; see the Vendor command contract section. New tags are additive (MINOR). | @@ -610,7 +667,7 @@ Every `--json` invocation emits a single JSON object that follows the **unified | Code | Subcommands | Meaning | |-----------------------|----------------------------------|---------| -| `manifest_not_found` | list, remove, repair, rollback | `.socket/manifest.json` doesn't exist. | +| `manifest_not_found` | list, remove, repair, rollback | `.socket/manifest.json` doesn't exist. v3.5: `repair` proceeds anyway (vendored phase only) when a vendor ledger or vendor-path lockfile references exist. | | `manifest_invalid` | list, remove | Manifest exists but is unparseable. | | `manifest_unreadable` | list, remove | I/O error reading manifest. | | `apply_failed` | apply | apply pipeline error before any patch ran. 
| @@ -624,7 +681,7 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `apply` | `Applied` · `Updated` · `Skipped` (already_patched / package_not_installed / vendored) · `Failed` · `Verified` (dry-run) | | `vendor` | `Applied` (= vendored; `command` routes) · `Skipped` (refusals, warnings, unsupported ecosystems) · `Failed` · `Removed` (reconcile + `--revert`) · `Verified` (dry-run) | | `list` | `Discovered` (with `details.vulnerabilities`, `details.tier`, `details.license`, `details.description`, `details.exportedAt`) | -| `repair`/`gc`| `Downloaded` (or `Verified` on dry-run) · `Removed` (or `Verified`) · `Failed` artifact events | +| `repair`/`gc`| `Downloaded` (or `Verified` on dry-run) · `Rebuilt` (vendored artifacts; `Verified` previews on dry-run) · `Skipped` (vendor_uuid_mismatch) · `Removed` (or `Verified`) · `Failed` events | | `remove` | `Removed` (per purl) · artifact-level `Removed` event (with `details.blobsRemoved`, `details.rolledBack`) | ### Migration status (v3.0) diff --git a/crates/socket-patch-cli/Cargo.toml b/crates/socket-patch-cli/Cargo.toml index cedba95..e983539 100644 --- a/crates/socket-patch-cli/Cargo.toml +++ b/crates/socket-patch-cli/Cargo.toml @@ -59,6 +59,9 @@ setup-e2e = [] [dev-dependencies] sha2 = { workspace = true } +# scan_vendor_e2e builds pristine registry tarballs for the auto-fetch tests. +tar = { workspace = true } +flate2 = { workspace = true } hex = { workspace = true } wiremock = { workspace = true } portable-pty = { workspace = true } diff --git a/crates/socket-patch-cli/src/args.rs b/crates/socket-patch-cli/src/args.rs index 784b0fe..1fde519 100644 --- a/crates/socket-patch-cli/src/args.rs +++ b/crates/socket-patch-cli/src/args.rs @@ -144,6 +144,19 @@ pub struct GlobalArgs { )] pub offline: bool, + /// Treat a beforeHash mismatch as a hard error. 
By DEFAULT a file whose + /// on-disk content matches neither the patch's beforeHash nor its + /// afterHash is overwritten with the full verified patched content and + /// surfaced as a stderr warning (`content_mismatch_overwritten`); this + /// flag restores the fail-closed behavior. `--force` overrides it. + #[arg( + long, + env = "SOCKET_STRICT", + default_value_t = false, + value_parser = parse_bool_flag, + )] + pub strict: bool, + /// Operate on globally-installed packages. #[arg( long = "global", @@ -378,6 +391,7 @@ impl Default for GlobalArgs { ecosystems: None, download_mode: "diff".to_string(), offline: false, + strict: false, global: false, global_prefix: None, json: false, diff --git a/crates/socket-patch-cli/src/commands/apply.rs b/crates/socket-patch-cli/src/commands/apply.rs index 07d11db..f839b7f 100644 --- a/crates/socket-patch-cli/src/commands/apply.rs +++ b/crates/socket-patch-cli/src/commands/apply.rs @@ -6,8 +6,111 @@ use socket_patch_core::crawlers::{ use socket_patch_core::manifest::operations::read_manifest; use socket_patch_core::manifest::schema::PatchRecord; use socket_patch_core::patch::apply::{ - apply_package_patch, verify_file_patch, ApplyResult, PatchSources, VerifyStatus, + apply_package_patch, verify_file_patch, ApplyResult, MismatchPolicy, PatchSources, VerifyStatus, }; +/// Files whose pre-apply content matched NEITHER hash and were (or would +/// be) overwritten with the verified patched content — the promoted +/// verify signature `apply_package_patch` leaves behind under the default +/// mismatch policy. +pub(crate) fn mismatch_overwritten_files(result: &ApplyResult) -> Vec<String> { + result + .files_verified + .iter() + .filter(|v| { + v.status == VerifyStatus::Ready + && v.expected_hash.is_some() + && v.current_hash != v.expected_hash + }) + .map(|v| v.file.clone()) + .collect() +} + +/// Surface one mismatch-overwrite per file on stderr (human mode). 
+fn warn_mismatch_overwrites(result: &ApplyResult, common: &GlobalArgs) { + if common.json || common.silent { + return; + } + for file in mismatch_overwritten_files(result) { + eprintln!( + "Warning (content_mismatch_overwritten): {} {file} did not match the patch's \ + expected original content; applied the full verified patched content instead \ + (pass --strict to fail on mismatches)", + socket_patch_core::utils::purl::normalize_purl(&result.package_key) + ); + } +} + +/// The default mismatch policy applies the FULL patched content for +/// mismatched files — and the full content lives in the afterHash blob, +/// which the default `--download-mode diff` may not have staged. Probe the +/// in-scope packages for mismatches and fetch the missing afterHash blobs +/// by hash (online only) so the apply below can fall through diff → blob. +async fn ensure_blobs_for_mismatches( + args: &ApplyArgs, + manifest: &socket_patch_core::manifest::schema::PatchManifest, + all_packages: &HashMap<String, PathBuf>, + blobs_path: &Path, +) { + if args.common.strict && !args.force { + return; // strict fails on mismatch — nothing to fetch + } + let mut needed: std::collections::HashSet<String> = std::collections::HashSet::new(); + for (purl, pkg_path) in all_packages { + let Some(record) = manifest.patches.get(purl) else { + continue; + }; + for (file_name, info) in &record.files { + if info.before_hash.is_empty() { + continue; + } + let verify = verify_file_patch(pkg_path, file_name, info).await; + if verify.status == socket_patch_core::patch::apply::VerifyStatus::HashMismatch + && tokio::fs::metadata(blobs_path.join(&info.after_hash)) + .await + .is_err() + { + needed.insert(info.after_hash.clone()); + } + } + } + if needed.is_empty() { + return; + } + if args.common.offline { + if !args.common.silent && !args.common.json { + eprintln!( + "Warning: {} mismatched file(s) need their full patched blob, but --offline \ + prevents fetching; those files will fail to apply", + needed.len() + ); + } + return; + 
} + if !args.common.silent && !args.common.json { + eprintln!( + "Downloading {} full patched blob(s) for mismatched file(s)...", + needed.len() + ); + } + let (client, _) = get_api_client_with_overrides(args.common.api_client_overrides()).await; + let _ = socket_patch_core::api::blob_fetcher::fetch_blobs_by_hash( + &needed, blobs_path, &client, None, + ) + .await; +} + +/// The mismatch policy this run applies with: `--force` ⊃ default +/// (adds the missing-file skip), `--strict` restores fail-closed. +pub(crate) fn mismatch_policy(force: bool, strict: bool) -> MismatchPolicy { + if force { + MismatchPolicy::Force + } else if strict { + MismatchPolicy::Strict + } else { + MismatchPolicy::Warn + } +} + #[cfg(feature = "golang")] use socket_patch_core::patch::go_redirect::{ apply_go_redirect, reconcile_go_redirects, verify_go_redirect_state, @@ -102,7 +205,7 @@ async fn try_local_go_apply( patch: &PatchRecord, sources: &PatchSources<'_>, common: &GlobalArgs, - force: bool, + policy: MismatchPolicy, ) -> Option<ApplyResult> { if !is_local_go(purl, common) { return None; @@ -126,7 +229,7 @@ async fn try_local_go_apply( sources, Some(&patch.uuid), common.dry_run, - force, + policy, ) .await, ) @@ -139,7 +242,7 @@ async fn try_local_go_apply( _patch: &PatchRecord, _sources: &PatchSources<'_>, _common: &GlobalArgs, - _force: bool, + _policy: MismatchPolicy, ) -> Option<ApplyResult> { None } @@ -538,6 +641,21 @@ pub async fn run(args: ApplyArgs) -> i32 { } for result in &results { env.record(result_to_event(result, args.common.dry_run)); + // Mismatch overwrites ride as Skipped warning events + // (same pattern as the vendor warnings): the package's + // Applied event stands, the warning is per-file. 
+ for file in mismatch_overwritten_files(result) { + env.record( + PatchEvent::new(PatchAction::Skipped, result.package_key.clone()) + .with_reason( + "content_mismatch_overwritten", + format!( + "{file} did not match the patch's expected original \ + content; the full verified patched content was applied" + ), + ), + ); + } // Sidecar records live on the envelope, not on // individual events. Consumers iterate // `envelope.sidecars[]` and JOIN against @@ -609,9 +727,16 @@ pub async fn run(args: ApplyArgs) -> i32 { } else { format!(" (via {})", tags.join("+")) }; - println!(" {}{}", result.package_key, suffix); + println!( + " {}{}", + socket_patch_core::utils::purl::normalize_purl(&result.package_key), + suffix + ); } else if all_files_already_patched(result) { - println!(" {} (already patched)", result.package_key); + println!( + " {} (already patched)", + socket_patch_core::utils::purl::normalize_purl(&result.package_key) + ); } } } @@ -888,6 +1013,7 @@ async fn apply_patches_inner( } // Apply patches + ensure_blobs_for_mismatches(args, &manifest, &all_packages, &blobs_path).await; let mut has_errors = false; // Group release-variant PURLs by base. PyPI (`?artifact_id=`), @@ -969,6 +1095,7 @@ async fn apply_patches_inner( blobs_path: &blobs_path, packages_path: Some(&packages_path), diffs_path: Some(&diffs_path), + mem_blobs: None, }; let result = apply_package_patch( variant_purl, @@ -977,10 +1104,11 @@ async fn apply_patches_inner( &sources, Some(&patch.uuid), args.common.dry_run, - args.force, + mismatch_policy(args.force, args.common.strict), ) .await; + warn_mismatch_overwrites(&result, &args.common); // A variant that reached apply is the installed distribution // (it passed the first-file check, or `--force` bypassed it), // so record it as matched whether or not the patch succeeded. 
@@ -1052,6 +1180,7 @@ async fn apply_patches_inner( blobs_path: &blobs_path, packages_path: Some(&packages_path), diffs_path: Some(&diffs_path), + mem_blobs: None, }; // Local go redirects to a project-local patched copy under // `.socket/go-patches/` wired via a `go.mod` `replace` (the module @@ -1059,25 +1188,32 @@ async fn apply_patches_inner( // Everything else — npm/pypi/gem and cargo (vendored or registry // cache) — patches in place via `apply_package_patch`. Without the // `golang` feature `try_local_go_apply` is an inert `None`. - let result = - match try_local_go_apply(purl, pkg_path, patch, &sources, &args.common, args.force) + let result = match try_local_go_apply( + purl, + pkg_path, + patch, + &sources, + &args.common, + mismatch_policy(args.force, args.common.strict), + ) + .await + { + Some(r) => r, + None => { + apply_package_patch( + purl, + pkg_path, + &patch.files, + &sources, + Some(&patch.uuid), + args.common.dry_run, + mismatch_policy(args.force, args.common.strict), + ) .await - { - Some(r) => r, - None => { - apply_package_patch( - purl, - pkg_path, - &patch.files, - &sources, - Some(&patch.uuid), - args.common.dry_run, - args.force, - ) - .await - } - }; + } + }; + warn_mismatch_overwrites(&result, &args.common); if !result.success { has_errors = true; if !args.common.silent && !args.common.json { @@ -1111,7 +1247,10 @@ async fn apply_patches_inner( unmatched.len() ); for purl in &unmatched { - eprintln!(" - {}", purl); + eprintln!( + " - {}", + socket_patch_core::utils::purl::normalize_purl(purl) + ); } } @@ -1289,7 +1428,7 @@ mod tests { .enumerate() .map(|(i, status)| VerifyResult { file: format!("package/f{i}.js"), - status: status.clone(), + status: *status, message: None, current_hash: None, expected_hash: None, diff --git a/crates/socket-patch-cli/src/commands/fetch_stage.rs b/crates/socket-patch-cli/src/commands/fetch_stage.rs index b1ffbb9..17db89b 100644 --- a/crates/socket-patch-cli/src/commands/fetch_stage.rs +++ 
b/crates/socket-patch-cli/src/commands/fetch_stage.rs @@ -7,6 +7,7 @@ //! cache is `repair`'s job, keeping these commands read-only against //! `.socket/`). +use std::collections::HashMap; use std::path::{Path, PathBuf}; use socket_patch_core::api::blob_fetcher::{ @@ -36,6 +37,7 @@ impl StagedSources { blobs_path: &self.blobs, packages_path: Some(&self.packages), diffs_path: Some(&self.diffs), + mem_blobs: None, } } } @@ -199,6 +201,7 @@ pub async fn stage_patch_sources( blobs_path: &stage_blobs, packages_path: Some(&stage_packages), diffs_path: Some(&stage_diffs), + mem_blobs: None, }; let fetch_result = fetch_missing_sources(manifest, &sources, download_mode, &client, None).await; @@ -244,3 +247,196 @@ pub async fn stage_patch_sources( _stage: Some(stage), })) } + +/// In-memory staged sources for the VENDOR flows. +/// +/// Existing `.socket/` artifacts are read in place (never copied, never +/// rewritten); patch content that is missing locally is fetched into +/// MEMORY via the patch view endpoint — vendoring writes no +/// `.socket/blobs` entries and no temporary files. The committed +/// `.socket/vendor/` artifact is the patch; nothing else should land on +/// disk. +pub struct MemStagedSources { + blobs: PathBuf, + diffs: PathBuf, + packages: PathBuf, + mem: HashMap>, +} + +impl MemStagedSources { + /// Borrow as the core pipeline's source set (memory overlay first, + /// on-disk artifacts as the read-only fallback). + pub fn as_patch_sources(&self) -> PatchSources<'_> { + PatchSources { + blobs_path: &self.blobs, + packages_path: Some(&self.packages), + diffs_path: Some(&self.diffs), + mem_blobs: Some(&self.mem), + } + } +} + +/// The in-memory staging outcome (mirror of [`StageOutcome`]). 
+pub enum MemStageOutcome { + Ready(MemStagedSources), + Unavailable, +} + +/// Stage patch sources for a VENDOR run without writing anything: +/// a record is locally satisfied when all its after-blobs are on disk or +/// a package archive is (a diff archive is NOT sufficient — vendor's +/// auto-force policy can need the full after-blob for files a diff cannot +/// reproduce); anything else has its full per-file content fetched into +/// memory from the patch view endpoint (`blobContent`), preceded by the +/// committed-artifact harvest. Offline runs with missing sources are +/// `Unavailable` with the same diagnostics as the disk stager. +pub async fn stage_vendor_sources_in_memory( + common: &GlobalArgs, + manifest: &PatchManifest, + socket_dir: &Path, + project_root: &Path, +) -> Result { + let blobs = socket_dir.join("blobs"); + let diffs = socket_dir.join("diffs"); + let packages = socket_dir.join("packages"); + + let missing_blobs = get_missing_blobs(manifest, &blobs).await; + let missing_package_archives = get_missing_archives(manifest, &packages).await; + + // A diff archive alone is NOT a sufficient source here, unlike the disk + // stager: vendoring runs the auto-force policy, where a beforeHash + // mismatch (already-applied tree, patch built against different bytes) + // is overwritten with the FULL after-blob — which a diff cannot + // produce. On-disk diffs still serve Strategy 2 for clean files; the + // after-blob content must additionally exist (disk, harvest, or fetch). 
+ let mut to_fetch: Vec<(&str, &str)> = manifest + .patches + .iter() + .filter_map(|(purl, record)| { + let all_blobs_present = record + .files + .values() + .all(|f| !missing_blobs.contains(&f.after_hash)); + let pkg_present = !missing_package_archives.contains(&record.uuid); + if all_blobs_present || pkg_present { + None + } else { + Some((purl.as_str(), record.uuid.as_str())) + } + }) + .collect(); + + if to_fetch.is_empty() { + return Ok(MemStageOutcome::Ready(MemStagedSources { + blobs, + diffs, + packages, + mem: HashMap::new(), + })); + } + + // The committed vendor artifact IS the patched content: harvest its + // afterHash blobs into memory so in-sync re-runs and fresh clones of + // already-vendored projects stage with no network and no disk blobs. + let mut mem = + socket_patch_core::patch::vendor::harvest_artifact_blobs(project_root, &manifest.patches) + .await; + if !mem.is_empty() { + to_fetch.retain(|(purl, _)| { + manifest.patches.get(*purl).is_none_or(|record| { + !record.files.values().all(|f| { + !missing_blobs.contains(&f.after_hash) || mem.contains_key(&f.after_hash) + }) + }) + }); + if to_fetch.is_empty() { + return Ok(MemStageOutcome::Ready(MemStagedSources { + blobs, + diffs, + packages, + mem, + })); + } + } + + if common.offline { + if !common.silent && !common.json { + eprintln!( + "Error: {} patch(es) have no local source and --offline is set:", + to_fetch.len() + ); + for (purl, _) in to_fetch.iter().take(5) { + eprintln!(" - {}", purl); + } + if to_fetch.len() > 5 { + eprintln!(" ... 
and {} more", to_fetch.len() - 5); + } + eprintln!("Run \"socket-patch repair\" to download missing artifacts."); + } + return Ok(MemStageOutcome::Unavailable); + } + + if !common.silent && !common.json { + println!( + "Fetching {} patch(es)' content (kept in memory)...", + to_fetch.len() + ); + } + + let (client, _) = get_api_client_with_overrides(common.api_client_overrides()).await; + let mut failed: Vec<&str> = Vec::new(); + for (purl, uuid) in &to_fetch { + match client.fetch_patch(common.org.as_deref(), uuid).await { + Ok(Some(patch)) => { + let mut complete = true; + for (file, info) in &patch.files { + let (Some(b64), Some(hash)) = (&info.blob_content, &info.after_hash) else { + if !common.silent && !common.json { + eprintln!(" [error] {purl}: no blob content served for {file}"); + } + complete = false; + break; + }; + // Same key guard as the disk writer: the hash names the + // lookup key the apply pipeline gates writes on. + if hash.len() != 64 || !hash.bytes().all(|b| b.is_ascii_hexdigit()) { + complete = false; + break; + } + match super::get::base64_decode(b64) { + Ok(bytes) => { + mem.insert(hash.clone(), bytes); + } + Err(_) => { + complete = false; + break; + } + } + } + if !complete { + failed.push(purl); + } + } + _ => failed.push(purl), + } + } + if !failed.is_empty() { + if !common.silent && !common.json { + eprintln!( + "Error: could not fetch patch content for {} patch(es):", + failed.len() + ); + for purl in failed.iter().take(5) { + eprintln!(" - {}", purl); + } + } + return Ok(MemStageOutcome::Unavailable); + } + + Ok(MemStageOutcome::Ready(MemStagedSources { + blobs, + diffs, + packages, + mem, + })) +} diff --git a/crates/socket-patch-cli/src/commands/get.rs b/crates/socket-patch-cli/src/commands/get.rs index e7359aa..0ba397f 100644 --- a/crates/socket-patch-cli/src/commands/get.rs +++ b/crates/socket-patch-cli/src/commands/get.rs @@ -13,7 +13,7 @@ use socket_patch_core::manifest::schema::{ }; use 
socket_patch_core::patch::apply::select_installed_variants; use socket_patch_core::utils::fuzzy_match::fuzzy_match_packages; -use socket_patch_core::utils::purl::{is_purl, strip_purl_qualifiers}; +use socket_patch_core::utils::purl::{is_purl, normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{track_patch_fetch_failed, track_patch_fetched}; use std::collections::HashMap; use std::fmt; @@ -346,6 +346,25 @@ fn build_patch_record(patch: &PatchResponse, files: HashMap (String, PatchRecord) { + let mut files = HashMap::new(); + for (file_path, file_info) in &patch.files { + if let (Some(before), Some(after)) = (&file_info.before_hash, &file_info.after_hash) { + files.insert( + file_path.clone(), + PatchFileInfo { + before_hash: before.clone(), + after_hash: after.clone(), + }, + ); + } + } + (patch.purl.clone(), build_patch_record(patch, files)) +} + #[derive(Args)] pub struct GetArgs { /// Patch identifier (UUID, CVE ID, GHSA ID, PURL, or package name). @@ -580,6 +599,14 @@ pub struct DownloadParams { /// `true` (`--all-releases`), every variant is downloaded. No effect /// on ecosystems without per-release artifact_id variants. pub all_releases: bool, + /// `--strict` forwarded to the nested apply (a beforeHash mismatch + /// fails instead of warn-and-overwrite). + pub strict: bool, + /// Persist downloaded blob content into `.socket/blobs` (the apply + /// flows need it for later hook/rollback runs). Vendor flows pass + /// `false`: their patch content is staged in memory and the committed + /// artifact is the patch — nothing should land in `.socket/blobs`. 
+ pub persist_blobs: bool, } /// Narrow a selection of patches down to the release variant(s) present @@ -765,14 +792,16 @@ pub(crate) async fn download_patch_records( let socket_dir = params.cwd.join(".socket"); let blobs_dir = socket_dir.join("blobs"); - if let Err(e) = tokio::fs::create_dir_all(&blobs_dir).await { - let err = format!("Failed to create blobs directory: {}", e); - report_error(params.json, &err); - return ( - 1, - serde_json::json!({"status": "error", "error": err}), - HashMap::new(), - ); + if params.persist_blobs { + if let Err(e) = tokio::fs::create_dir_all(&blobs_dir).await { + let err = format!("Failed to create blobs directory: {}", e); + report_error(params.json, &err); + return ( + 1, + serde_json::json!({"status": "error", "error": err}), + HashMap::new(), + ); + } } let mut narrow_warnings: Vec = Vec::new(); @@ -851,9 +880,13 @@ pub(crate) async fn download_patch_records( } } let quiet = params.json || params.silent; - if write_all_patch_blobs(&blobs_dir, &patch, quiet) - .await - .is_err() + // Vendor flows keep blob content in memory (the vendor + // step re-fetches what it needs); persisting blobs here + // would litter .socket/blobs for no consumer. 
+ if params.persist_blobs + && write_all_patch_blobs(&blobs_dir, &patch, quiet) + .await + .is_err() { failed += 1; patch_records_json.push(serde_json::json!({ @@ -977,10 +1010,12 @@ pub async fn download_and_apply_patches( report_error(params.json, &err); return (1, serde_json::json!({"status": "error", "error": err})); } - if let Err(e) = tokio::fs::create_dir_all(&blobs_dir).await { - let err = format!("Failed to create blobs directory: {}", e); - report_error(params.json, &err); - return (1, serde_json::json!({"status": "error", "error": err})); + if params.persist_blobs { + if let Err(e) = tokio::fs::create_dir_all(&blobs_dir).await { + let err = format!("Failed to create blobs directory: {}", e); + report_error(params.json, &err); + return (1, serde_json::json!({"status": "error", "error": err})); + } } let mut manifest = match read_manifest(&manifest_path).await { @@ -1030,7 +1065,10 @@ pub async fn download_and_apply_patches( let action = decide_patch_action(&manifest, &patch.purl, &patch.uuid); if let PatchAction::Skipped = action { if !params.json && !params.silent { - eprintln!(" [skip] {} (already in manifest)", patch.purl); + eprintln!( + " [skip] {} (already in manifest)", + normalize_purl(&patch.purl) + ); } downloaded_patches.push(serde_json::json!({ "purl": patch.purl, @@ -1060,9 +1098,13 @@ pub async fn download_and_apply_patches( } let quiet = params.json || params.silent; - if write_all_patch_blobs(&blobs_dir, &patch, quiet) - .await - .is_err() + // Vendor flows keep blob content in memory (the vendor + // step re-fetches what it needs); persisting blobs here + // would litter .socket/blobs for no consumer. 
+ if params.persist_blobs + && write_all_patch_blobs(&blobs_dir, &patch, quiet) + .await + .is_err() { patches_failed += 1; downloaded_patches.push(serde_json::json!({ @@ -1193,6 +1235,7 @@ pub async fn download_and_apply_patches( global_prefix: params.global_prefix.clone(), silent: params.json || params.silent, download_mode: params.download_mode.clone(), + strict: params.strict, ..crate::args::GlobalArgs::default() }, force: false, @@ -1621,6 +1664,8 @@ pub async fn run(args: GetArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, + persist_blobs: true, }; let (code, result_json) = download_and_apply_patches(&selected, ¶ms).await; @@ -1810,6 +1855,7 @@ async fn save_and_apply_patch(args: &GetArgs, patch: &PatchResponse) -> i32 { global_prefix: args.common.global_prefix.clone(), silent: quiet, download_mode: args.common.download_mode.clone(), + strict: args.common.strict, ..crate::args::GlobalArgs::default() }, force: false, @@ -1863,7 +1909,7 @@ async fn save_and_apply_patch(args: &GetArgs, patch: &PatchResponse) -> i32 { exit_code } -fn base64_decode(input: &str) -> Result, String> { +pub(crate) fn base64_decode(input: &str) -> Result, String> { let chars = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; let mut table = [255u8; 256]; for (i, &c) in chars.iter().enumerate() { diff --git a/crates/socket-patch-cli/src/commands/mod.rs b/crates/socket-patch-cli/src/commands/mod.rs index 1fa97c9..51b1829 100644 --- a/crates/socket-patch-cli/src/commands/mod.rs +++ b/crates/socket-patch-cli/src/commands/mod.rs @@ -5,6 +5,7 @@ pub mod list; pub mod lock_cli; pub mod remove; pub mod repair; +pub mod repair_vendor; pub mod rollback; pub mod scan; pub mod setup; diff --git a/crates/socket-patch-cli/src/commands/repair.rs b/crates/socket-patch-cli/src/commands/repair.rs index f83b9fb..4050e8c 100644 --- 
a/crates/socket-patch-cli/src/commands/repair.rs +++ b/crates/socket-patch-cli/src/commands/repair.rs @@ -61,18 +61,37 @@ pub async fn run(args: RepairArgs) -> i32 { let manifest_path = args.common.resolved_manifest_path(); if tokio::fs::metadata(&manifest_path).await.is_err() { - if args.common.json { - let mut env = Envelope::new(Command::Repair); - env.dry_run = args.common.dry_run; - env.mark_error(EnvelopeError::new( - "manifest_not_found", - format!("Manifest not found at {}", manifest_path.display()), - )); - println!("{}", env.to_pretty_json()); - } else { - eprintln!("Manifest not found at {}", manifest_path.display()); + // No manifest is still repairable when the project carries vendored + // state: a committed ledger, or lockfiles rewired to + // `.socket/vendor/...` paths (the ledger itself may be the thing + // that needs repairing). Only a project with neither is an error. + let state_file = args + .common + .cwd + .join(socket_patch_core::patch::vendor::VENDOR_STATE_REL); + let has_vendor_traces = tokio::fs::metadata(&state_file).await.is_ok() + || !crate::commands::repair_vendor::scan_vendor_references(&args.common.cwd) + .await + .is_empty(); + if !has_vendor_traces { + if args.common.json { + let mut env = Envelope::new(Command::Repair); + env.dry_run = args.common.dry_run; + env.mark_error(EnvelopeError::new( + "manifest_not_found", + format!("Manifest not found at {}", manifest_path.display()), + )); + println!("{}", env.to_pretty_json()); + } else { + eprintln!("Manifest not found at {}", manifest_path.display()); + } + return 1; + } + // The vendor-only repair still serializes on the .socket lock; the + // lock layer deliberately refuses to mkdir. 
+ if let Some(dir) = manifest_path.parent() { + let _ = tokio::fs::create_dir_all(dir).await; } - return 1; } // Serialize against concurrent socket-patch runs targeting the @@ -165,10 +184,11 @@ pub(crate) async fn repair_inner( args: &RepairArgs, manifest_path: &Path, ) -> Result<(Envelope, RepairCounts), String> { + // `Ok(None)` = no manifest (vendor-only repair); present-but-invalid + // stays a hard error. let manifest = read_manifest(manifest_path) .await - .map_err(|e| e.to_string())? - .ok_or_else(|| "Invalid manifest".to_string())?; + .map_err(|e| e.to_string())?; let socket_dir = manifest_path.parent().unwrap(); let blobs_path = socket_dir.join("blobs"); @@ -192,19 +212,69 @@ pub(crate) async fn repair_inner( let mut blobs_checked = 0usize; let mut bytes_freed = 0u64; + // The envelope is built up-front: the vendored-artifact phase records + // its events inline; the download/cleanup aggregates are appended at + // the end (event ordering is documented best-effort). + let mut env = Envelope::new(Command::Repair); + env.dry_run = args.common.dry_run; + // Step 1: Check for and download missing artifacts in the requested // mode. Counts below refer to whatever kind of artifact was requested // (file blobs, diff archives, or package archives). - let missing_artifacts: Vec = match download_mode { - DownloadMode::File => get_missing_blobs(&manifest, &blobs_path) + // + // VENDORED-in-sync manifest entries are excluded: vendor flows keep + // patch content in memory and the committed artifact IS the patch, so + // a fully-vendored project legitimately has no `.socket/blobs|diffs| + // packages` — repair must not re-litter them (or fail trying). The + // cleanup phase below still uses the FULL manifest, so it never sweeps + // sources an in-place apply may need for rollback. 
+ let vendor_state = socket_patch_core::patch::vendor::load_state(&args.common.cwd) + .await + .unwrap_or_default(); + // Lockfile vendor references count as vendored even before the ledger + // is reconstructed, so a no-ledger repair doesn't download sources for + // entries the vendored phase is about to own. + let referenced_uuids: std::collections::HashSet = + crate::commands::repair_vendor::scan_vendor_references(&args.common.cwd) + .await + .into_iter() + .map(|(_, uuid, _)| uuid) + .collect(); + let scoped_manifest = manifest.as_ref().map(|m| { + let patches = m + .patches + .iter() + .filter(|(purl, rec)| { + !referenced_uuids.contains(&rec.uuid) + && vendor_state + .entries + .get(*purl) + .or_else(|| { + vendor_state + .entries + .values() + .find(|e| &e.base_purl == *purl) + }) + .is_none_or(|e| e.uuid != rec.uuid) + }) + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + socket_patch_core::manifest::schema::PatchManifest { + patches, + setup: m.setup.clone(), + } + }); + let missing_artifacts: Vec = match (&scoped_manifest, download_mode) { + (None, _) => Vec::new(), + (Some(m), DownloadMode::File) => get_missing_blobs(m, &blobs_path) .await .into_iter() .collect(), - DownloadMode::Diff => get_missing_archives(&manifest, &diffs_path) + (Some(m), DownloadMode::Diff) => get_missing_archives(m, &diffs_path) .await .into_iter() .collect(), - DownloadMode::Package => get_missing_archives(&manifest, &packages_path) + (Some(m), DownloadMode::Package) => get_missing_archives(m, &packages_path) .await .into_iter() .collect(), @@ -241,9 +311,15 @@ pub(crate) async fn repair_inner( blobs_path: &blobs_path, packages_path: Some(&packages_path), diffs_path: Some(&diffs_path), + mem_blobs: None, }; + // Step 1 only runs with a manifest (missing_artifacts is + // empty otherwise), so the expect is unreachable. 
+ let m = scoped_manifest + .as_ref() + .expect("step 1 requires a manifest"); let fetch_result = - fetch_missing_sources(&manifest, &sources, download_mode, &client, None).await; + fetch_missing_sources(m, &sources, download_mode, &client, None).await; downloaded_count = fetch_result.downloaded; download_failed_count = fetch_result.failed; if !quiet { @@ -277,8 +353,24 @@ pub(crate) async fn repair_inner( ); } + // Step 1.5: vendored artifacts — health-check the ledger (and any + // lockfile vendor references with no ledger coverage) and rebuild + // missing/corrupt artifacts. Runs under `--download-only` too: + // restoring artifacts IS repair's download half. + let vendor_counts = crate::commands::repair_vendor::repair_vendored_artifacts( + &args.common, + manifest.as_ref(), + socket_dir, + &mut env, + ) + .await; + if !quiet && vendor_counts.rebuilt > 0 { + println!("Rebuilt {} vendored artifact(s).", vendor_counts.rebuilt); + } + // Step 2: Clean up unused artifacts across all three directories. - if !args.download_only { + if let (false, Some(manifest)) = (args.download_only, manifest.as_ref()) { + let manifest = manifest.clone(); if !quiet { println!(); } @@ -360,8 +452,6 @@ pub(crate) async fn repair_inner( // Translate the aggregate counts into envelope events. `repair` // operates on artifacts (not specific patches), so events use the // `PatchEvent::artifact` form (no PURL/UUID). - let mut env = Envelope::new(Command::Repair); - env.dry_run = args.common.dry_run; let action_for_repair = if args.common.dry_run { PatchAction::Verified } else { diff --git a/crates/socket-patch-cli/src/commands/repair_vendor.rs b/crates/socket-patch-cli/src/commands/repair_vendor.rs new file mode 100644 index 0000000..ee9801c --- /dev/null +++ b/crates/socket-patch-cli/src/commands/repair_vendor.rs @@ -0,0 +1,853 @@ +//! `repair`'s vendored-artifact phase: rebuild committed vendor artifacts +//! that are referenced (ledger entry and/or rewired lockfile) but missing +//! 
or corrupt on disk. +//! +//! Detection is the core health check ([`check_vendored_artifact`]: per-file +//! afterHashes + the whole-file ledger sha256 for file-shaped artifacts). +//! Rebuilds re-dispatch the normal vendor backends — their wired hot paths +//! rebuild the ARTIFACT only and never touch lockfiles or re-record ledger +//! originals — fed by the same pristine-source ladder as `vendor` (installed +//! copy → lockfile-verified registry fetch → ledger-recovered pre-vendor +//! fragment), with patch content staged in memory. +//! +//! Lockfile references with NO ledger coverage (`.socket/vendor` deleted +//! wholesale, state.json included) are RECONSTRUCTED: the uuid is recovered +//! from the lockfile path itself (the contract's uuid-in-path rule), the +//! record from the manifest (or the patch API, yielding a detached entry), +//! and a fresh ledger entry is re-synthesized so sweep/GC/revert know the +//! artifact again. Reconstructed entries carry no pre-vendor wiring +//! originals — `--revert` degrades to its documented +//! `vendor_lock_entry_drifted` re-resolve guidance. 
+ +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +use socket_patch_core::api::client::get_api_client_with_overrides; +use socket_patch_core::crawlers::CrawlerOptions; +use socket_patch_core::manifest::schema::{PatchManifest, PatchRecord}; +use socket_patch_core::patch::copy_tree::remove_tree; +use socket_patch_core::patch::vendor::{ + self, check_vendored_artifact, file_sha256_hex, load_state, lock_inventory, parse_vendor_path, + registry_fetch, ArtifactHealth, VendorEntry, +}; +use socket_patch_core::utils::purl::strip_purl_qualifiers; +use socket_patch_core::vex::time::now_rfc3339; + +use crate::args::GlobalArgs; +use crate::commands::fetch_stage::{stage_vendor_sources_in_memory, MemStageOutcome}; +use crate::commands::vendor::{ + dispatch_vendor_one, ecosystem_in_scope, fetch_pristine_package, persist_vendor_entry, + record_warning, PristineFetch, +}; +use crate::ecosystem_dispatch::{find_packages_for_purls, partition_purls}; +use crate::json_envelope::{Envelope, PatchAction, PatchEvent}; + +/// Counts surfaced to `repair_inner` for telemetry/human output. +#[derive(Default)] +pub(crate) struct RepairVendorCounts { + pub rebuilt: usize, + pub failed: usize, + pub healthy: usize, +} + +/// One broken vendored unit queued for rebuild. +struct Candidate { + purl: String, + entry: VendorEntry, + record: PatchRecord, + detached: bool, + /// True when the ledger entry was re-synthesized from a lockfile + /// reference (it must be persisted after a successful rebuild). + reconstructed: bool, + reason: &'static str, +} + +/// Files the vendor backends rewire — the search space for +/// `.socket/vendor///` references when the ledger is gone. 
+const WIRING_FILES: &[&str] = &[ + "package-lock.json", + "npm-shrinkwrap.json", + "pnpm-lock.yaml", + "yarn.lock", + "bun.lock", + "package.json", + "Cargo.toml", + "Cargo.lock", + ".cargo/config.toml", + "go.mod", + "composer.json", + "composer.lock", + "Gemfile", + "Gemfile.lock", + "uv.lock", + "pyproject.toml", + "poetry.lock", + "pdm.lock", + "Pipfile.lock", + "requirements.txt", +]; + +/// Scan the wiring-bearing files for vendored-artifact references, +/// returning deduped `(ecosystem, uuid, artifact relpath)` triples. Pure +/// text scan + the canonical path parser — the same recovery rule the CLI +/// contract documents for external tools. +pub(crate) async fn scan_vendor_references(project_root: &Path) -> Vec<(String, String, String)> { + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut out = Vec::new(); + for file in WIRING_FILES { + let Ok(text) = tokio::fs::read_to_string(project_root.join(file)).await else { + continue; + }; + let mut rest = text.as_str(); + while let Some(idx) = rest.find(".socket") { + let slice = &rest[idx..]; + // `:` ends a reference too: pnpm snapshot keys are + // `name@file::` and yaml mappings suffix the path with a + // colon — npm names/versions never contain one. 
+ let end = slice + .find([ + '"', '\'', '`', ' ', '\t', '\n', '\r', ',', ')', ']', '}', ';', ':', + ]) + .unwrap_or(slice.len()); + let candidate = slice[..end].replace('\\', "/"); + if let Some(parts) = parse_vendor_path(&candidate) { + if seen.insert((parts.eco.to_string(), parts.uuid.clone())) { + out.push(( + parts.eco.to_string(), + parts.uuid.clone(), + candidate.trim_start_matches("./").to_string(), + )); + } + } + rest = &rest[idx + ".socket".len()..]; + } + } + out.sort(); + out +} + +fn synth_entry(eco: &str, uuid: &str, artifact_path: &str, base_purl: &str) -> VendorEntry { + VendorEntry { + ecosystem: eco.to_string(), + base_purl: base_purl.to_string(), + uuid: uuid.to_string(), + artifact: socket_patch_core::patch::vendor::state::VendorArtifact { + path: artifact_path.to_string(), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring: Vec::new(), + lock: None, + took_over_go_patches: false, + detached: false, + record: None, + flavor: None, + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } +} + +fn fail( + env: &mut Envelope, + counts: &mut RepairVendorCounts, + quiet: bool, + purl: &str, + code: &str, + detail: String, +) { + if !quiet { + eprintln!( + "Cannot repair vendored artifact for {}: {detail}", + socket_patch_core::utils::purl::normalize_purl(purl) + ); + } + env.record(PatchEvent::new(PatchAction::Failed, purl.to_string()).with_error(code, detail)); + env.mark_partial_failure(); + counts.failed += 1; +} + +/// The vendored-artifact phase of `repair`. Runs between the download and +/// cleanup phases (and under `--download-only` — restoring artifacts IS +/// repair's job). `manifest` is `None` when the project has no +/// `.socket/manifest.json` (detached/reconstruction-only repairs). 
+pub(crate) async fn repair_vendored_artifacts( + common: &GlobalArgs, + manifest: Option<&PatchManifest>, + socket_dir: &Path, + env: &mut Envelope, +) -> RepairVendorCounts { + let quiet = common.json || common.silent; + let mut counts = RepairVendorCounts::default(); + + let mut state = match load_state(&common.cwd).await { + Ok(s) => s, + Err(e) => { + env.record( + PatchEvent::artifact(PatchAction::Failed) + .with_error("vendor_state_unreadable", e.to_string()), + ); + env.mark_partial_failure(); + counts.failed += 1; + return counts; + } + }; + + // ── Pass 1: ledger-driven health check ─────────────────────────────── + let mut candidates: Vec = Vec::new(); + let mut ledger_purls: Vec = state.entries.keys().cloned().collect(); + ledger_purls.sort(); + for purl in &ledger_purls { + let entry = state.entries[purl].clone(); + if !ecosystem_in_scope(common, &entry.ecosystem) { + continue; + } + let record = match (&entry.record, manifest) { + (Some(r), _) => r.clone(), + (None, Some(m)) => { + match m + .patches + .get(purl) + .cloned() + .or_else(|| m.patches.values().find(|r| r.uuid == entry.uuid).cloned()) + { + Some(r) => r, + // Dropped from the manifest: the vendor reconcile owns + // reverting it — not repair's call. + None => continue, + } + } + // Non-detached entry with no manifest at all: recover the + // record from the API below, like a reconstruction. 
+ (None, None) => match fetch_record_by_uuid(common, &entry.uuid).await { + Some((_, r)) => r, + None => { + fail( + env, + &mut counts, + quiet, + purl, + "vendor_artifact_unrepairable", + format!( + "no manifest record for patch {} and the patch view could not \ + be fetched (offline or API failure)", + entry.uuid + ), + ); + continue; + } + }, + }; + if record.uuid != entry.uuid { + env.record( + PatchEvent::new(PatchAction::Skipped, purl.clone()).with_reason( + "vendor_uuid_mismatch", + "the manifest's patch uuid moved on; run `socket-patch vendor` (or \ + `scan --vendor`) to re-vendor", + ), + ); + continue; + } + match check_vendored_artifact(&common.cwd, &entry, &record).await { + ArtifactHealth::Healthy => counts.healthy += 1, + ArtifactHealth::StaleUuid => { + env.record( + PatchEvent::new(PatchAction::Skipped, purl.clone()).with_reason( + "vendor_uuid_mismatch", + "a re-vendor is pending for this package; run `socket-patch vendor`", + ), + ); + } + ArtifactHealth::Unverifiable { reason } => { + fail( + env, + &mut counts, + quiet, + purl, + "vendor_artifact_unrepairable", + format!("the ledger entry cannot be verified ({reason}); fix state.json"), + ); + } + ArtifactHealth::Missing => { + let detached = entry.detached; + candidates.push(Candidate { + purl: purl.clone(), + entry, + record, + detached, + reconstructed: false, + reason: "vendor_artifact_missing", + }); + } + ArtifactHealth::Corrupt { .. 
} => { + let detached = entry.detached; + candidates.push(Candidate { + purl: purl.clone(), + entry, + record, + detached, + reconstructed: false, + reason: "vendor_artifact_corrupt", + }); + } + } + } + + // ── Pass 2: lockfile references with no ledger coverage ───────────── + let covered: HashSet<(String, String)> = state + .entries + .values() + .map(|e| (e.ecosystem.clone(), e.uuid.clone())) + .collect(); + for (eco, uuid, relpath) in scan_vendor_references(&common.cwd).await { + if covered.contains(&(eco.clone(), uuid.clone())) || !ecosystem_in_scope(common, &eco) { + continue; + } + // The record: manifest by uuid first, else the patch API (the entry + // is then detached — exactly the manifest-less vendoring shape). + let (purl, record, detached) = + match manifest.and_then(|m| m.patches.iter().find(|(_, r)| r.uuid == uuid)) { + Some((p, r)) => (p.clone(), r.clone(), false), + None => match fetch_record_by_uuid(common, &uuid).await { + Some((purl, r)) => (purl, r, true), + None => { + fail( + env, + &mut counts, + quiet, + &format!("pkg:{eco}/unknown@{uuid}"), + "vendor_artifact_missing", + format!( + "the lockfile references .socket/vendor/{eco}/{uuid}/ but the \ + vendor ledger is gone and the patch view could not be fetched \ + (offline or API failure); restore .socket/vendor/state.json or \ + re-run online" + ), + ); + continue; + } + }, + }; + let mut entry = synth_entry(&eco, &uuid, &relpath, strip_purl_qualifiers(&purl)); + entry.detached = detached; + if detached { + entry.record = Some(record.clone()); + } + match check_vendored_artifact(&common.cwd, &entry, &record).await { + ArtifactHealth::Healthy => { + // The artifact survived; only the ledger was lost. Restore + // the entry (sha/size recomputed) so GC/sweep/revert know + // the artifact again — without it the next `scan --prune` + // would sweep the uuid dir as an orphan. 
+ if common.dry_run { + env.record( + PatchEvent::new(PatchAction::Verified, purl.clone()).with_details( + serde_json::json!({ + "vendorArtifact": true, + "wouldRestoreLedgerEntry": true, + "path": relpath, + }), + ), + ); + continue; + } + fill_artifact_fingerprint(&common.cwd, &mut entry).await; + let save_failed = + persist_vendor_entry(common, env, &mut state, &purl, entry, detached, &record) + .await; + if save_failed { + counts.failed += 1; + continue; + } + env.record( + PatchEvent::new(PatchAction::Rebuilt, purl.clone()).with_details( + serde_json::json!({ + "path": relpath, + "ledgerRestored": true, + "artifactRebuilt": false, + }), + ), + ); + counts.rebuilt += 1; + } + _ => { + candidates.push(Candidate { + purl, + entry, + record, + detached, + reconstructed: true, + reason: "vendor_artifact_missing", + }); + } + } + } + + if candidates.is_empty() { + return counts; + } + + // ── Dry run: preview only ──────────────────────────────────────────── + if common.dry_run { + for c in &candidates { + env.record( + PatchEvent::new(PatchAction::Verified, c.purl.clone()).with_details( + serde_json::json!({ + "vendorArtifact": true, + "wouldRebuild": true, + "reason": c.reason, + "path": c.entry.artifact.path, + }), + ), + ); + } + return counts; + } + + if !quiet { + println!( + "\nRebuilding {} broken vendored artifact(s)...", + candidates.len() + ); + } + + // ── Corrupt artifacts are deleted first ────────────────────────────── + // The backends' wired hot paths rebuild on MISSING; turning corrupt + // into missing gives every ecosystem one uniform rebuild trigger (and + // never leaves tampered bytes to be blended into a rebuild). 
+ for c in &candidates { + if c.reason == "vendor_artifact_corrupt" { + if let Some(rel) = vendor::path::vendor_uuid_dir_rel(&c.entry.ecosystem, &c.entry.uuid) + { + let _ = remove_tree(&common.cwd.join(rel)).await; + } + } + } + + // ── Patch content (in memory, like all vendor flows) ──────────────── + let records_map: HashMap = candidates + .iter() + .map(|c| (c.purl.clone(), c.record.clone())) + .collect(); + let synth = PatchManifest { + patches: records_map, + setup: None, + }; + let staged = match stage_vendor_sources_in_memory(common, &synth, socket_dir, &common.cwd).await + { + Ok(MemStageOutcome::Ready(s)) => s, + Ok(MemStageOutcome::Unavailable) => { + for c in &candidates { + fail( + env, + &mut counts, + quiet, + &c.purl, + c.reason, + format!( + "the vendored artifact at {} is broken and its patch content has \ + no local source ({})", + c.entry.artifact.path, + if common.offline { + "--offline prevents fetching it" + } else { + "download failed" + } + ), + ); + } + return counts; + } + Err(e) => { + env.record(PatchEvent::artifact(PatchAction::Failed).with_error("stage_failed", e)); + env.mark_partial_failure(); + counts.failed += candidates.len(); + return counts; + } + }; + let sources = staged.as_patch_sources(); + + // ── Pristine package sources ───────────────────────────────────────── + let purls: Vec = candidates.iter().map(|c| c.purl.clone()).collect(); + let partitioned = partition_purls(&purls, common.ecosystems.as_deref()); + let crawler_options = CrawlerOptions { + cwd: common.cwd.clone(), + global: common.global, + global_prefix: common.global_prefix.clone(), + batch_size: 100, + }; + let mut all_packages = find_packages_for_purls(&partitioned, &crawler_options, quiet).await; + let inventory = lock_inventory::inventory_project(&common.cwd).await; + let client = registry_fetch::build_registry_client(); + let mut holders: Vec = Vec::new(); + let mut unrebuildable: HashSet = HashSet::new(); + // Reconstructed npm candidates fetched 
UNVERIFIED from the conventional + // registry: their rebuilt tarball MUST match the integrity the rewired + // lockfile records (the trust anchor) before anything is persisted. + let mut must_verify: HashMap = HashMap::new(); + for c in &candidates { + if all_packages.contains_key(&c.purl) { + continue; // installed copy: works offline too + } + if common.offline { + fail( + env, + &mut counts, + quiet, + &c.purl, + c.reason, + format!( + "the vendored artifact at {} is broken, the package is not installed, \ + and --offline prevents fetching a pristine copy", + c.entry.artifact.path + ), + ); + unrebuildable.insert(c.purl.clone()); + continue; + } + match fetch_pristine_package(&common.cwd, &inventory, &client, &c.purl, Some(&c.entry)) + .await + { + PristineFetch::Fetched(fetched) => { + all_packages.insert(c.purl.clone(), fetched.dir().to_path_buf()); + holders.push(fetched); + } + PristineFetch::NoSource | PristineFetch::Unverifiable(_) => { + // Last rung (npm): the REWIRED lockfile still records the + // integrity of our packed tarball. Fetch the pristine copy + // unverified, rebuild deterministically, and verify the + // REBUILT artifact against that wired integrity below — + // end-to-end fail-closed without ledger or installed copy. 
+ if c.entry.ecosystem == "npm" { + if let Some(wired) = + lock_inventory::wired_vendor_integrity(&common.cwd, &c.entry.artifact.path) + .await + { + if let Some((name, version)) = npm_coords(&c.entry.base_purl) { + match registry_fetch::fetch_npm_unverified(&name, &version, &client) + .await + { + Ok(fetched) => { + all_packages + .insert(c.purl.clone(), fetched.dir().to_path_buf()); + holders.push(fetched); + must_verify.insert(c.purl.clone(), wired); + continue; + } + Err(registry_fetch::FetchError::Failed(d)) + | Err(registry_fetch::FetchError::Unverifiable(d)) => { + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_fetch_failed", + d, + ); + unrebuildable.insert(c.purl.clone()); + continue; + } + } + } + } + } + let detail = fetch_pristine_unrepairable_detail(c).unwrap_or_else(|| { + "no verifiable pristine source: the package is not installed, the \ + lockfile is rewired to the (broken) vendored artifact, and the \ + ledger records no recoverable registry fragment" + .to_string() + }); + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_unrepairable", + detail, + ); + unrebuildable.insert(c.purl.clone()); + } + PristineFetch::Failed(detail) => { + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_fetch_failed", + detail, + ); + unrebuildable.insert(c.purl.clone()); + } + } + } + + // ── Rebuild via the normal backends ────────────────────────────────── + let vendored_at = now_rfc3339(); + for c in candidates { + if unrebuildable.contains(&c.purl) { + continue; + } + let Some(pkg_path) = all_packages.get(&c.purl).cloned() else { + continue; // failed above + }; + let outcome = dispatch_vendor_one( + &c.purl, + &pkg_path, + &common.cwd, + &c.record, + &sources, + &vendored_at, + false, + false, + ) + .await; + match outcome { + None => { + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_unrepairable", + "no vendor backend for this ecosystem in this build".to_string(), + ); + } + 
Some(socket_patch_core::patch::vendor::VendorOutcome::Refused { code, detail }) => { + fail(env, &mut counts, quiet, &c.purl, code, detail); + } + Some(socket_patch_core::patch::vendor::VendorOutcome::Done { + result, + entry, + warnings, + }) => { + if !result.success { + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_rebuild_failed", + result.error.unwrap_or_else(|| "rebuild failed".to_string()), + ); + continue; + } + for w in &warnings { + // The Rebuilt event below carries the rebuild signal. + if w.code != "vendor_artifact_rebuilt" { + record_warning(env, &c.purl, w, common); + } + } + // Unverified pristine source: the rebuilt tarball must + // reproduce the integrity the rewired lockfile records. + if let Some(wired) = must_verify.get(&c.purl) { + let abs = common.cwd.join(&c.entry.artifact.path); + let verdict = match tokio::fs::read(&abs).await { + Ok(bytes) => { + let name = npm_coords(&c.entry.base_purl) + .map(|(n, _)| n) + .unwrap_or_default(); + registry_fetch::artifact_matches_integrity(&bytes, &name, wired) + } + Err(e) => Err(format!("cannot read the rebuilt artifact: {e}")), + }; + if let Err(detail) = verdict { + if let Some(rel) = + vendor::path::vendor_uuid_dir_rel(&c.entry.ecosystem, &c.entry.uuid) + { + let _ = remove_tree(&common.cwd.join(rel)).await; + } + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_rebuild_failed", + format!( + "the rebuilt artifact does not match the integrity the \ + lockfile records ({detail}); the pristine source may have \ + been tampered with — nothing was kept" + ), + ); + continue; + } + } + // The entry whose recorded fingerprint the post-check must + // match: a backend-returned entry (drift healed / wiring + // re-recorded) wins; a reconstructed entry gets its + // fingerprint computed from the rebuilt bytes. 
+ let mut check_entry = c.entry.clone(); + if let Some(e) = entry { + check_entry = e.clone(); + if persist_vendor_entry( + common, env, &mut state, &c.purl, e, c.detached, &c.record, + ) + .await + { + counts.failed += 1; + continue; + } + } else if c.reconstructed { + fill_artifact_fingerprint(&common.cwd, &mut check_entry).await; + if persist_vendor_entry( + common, + env, + &mut state, + &c.purl, + check_entry.clone(), + c.detached, + &c.record, + ) + .await + { + counts.failed += 1; + continue; + } + } + // ── Fail-closed post-verify ────────────────────────────── + match check_vendored_artifact(&common.cwd, &check_entry, &c.record).await { + ArtifactHealth::Healthy => { + if !quiet { + println!( + "Rebuilt {} ({})", + socket_patch_core::utils::purl::normalize_purl(&c.purl), + check_entry.artifact.path + ); + } + env.record( + PatchEvent::new(PatchAction::Rebuilt, c.purl.clone()).with_details( + serde_json::json!({ + "path": check_entry.artifact.path, + "reason": c.reason, + }), + ), + ); + counts.rebuilt += 1; + } + other => { + // The deterministic rebuild did not reproduce the + // recorded artifact (e.g. a tampered ledger sha): + // remove it rather than leave unverifiable bytes. + if let Some(rel) = vendor::path::vendor_uuid_dir_rel( + &check_entry.ecosystem, + &check_entry.uuid, + ) { + let _ = remove_tree(&common.cwd.join(rel)).await; + } + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_rebuild_failed", + format!( + "the rebuilt artifact does not match the recorded \ + fingerprint ({other:?}); if state.json was edited, run \ + `socket-patch vendor` to re-vendor from scratch", + ), + ); + } + } + } + } + } + drop(holders); + counts +} + +/// Compute and record the artifact fingerprint (sha256 + size for +/// file-shaped artifacts) on a re-synthesized ledger entry. 
+async fn fill_artifact_fingerprint(project_root: &Path, entry: &mut VendorEntry) { + let norm = entry.artifact.path.replace('\\', "/"); + if !(norm.ends_with(".tgz") || norm.ends_with(".tar.gz") || norm.ends_with(".whl")) { + return; // dir-shaped: integrity is per-file afterHashes + } + let abs = project_root.join(&norm); + if let Some(hex) = file_sha256_hex(&abs).await { + entry.artifact.sha256 = hex; + } + if let Ok(meta) = tokio::fs::metadata(&abs).await { + entry.artifact.size = Some(meta.len()); + } +} + +/// Fetch one patch view by uuid (proxy-aware) and shape it as a manifest +/// record; `None` offline or on any API failure. +async fn fetch_record_by_uuid(common: &GlobalArgs, uuid: &str) -> Option<(String, PatchRecord)> { + if common.offline { + return None; + } + let (client, _) = get_api_client_with_overrides(common.api_client_overrides()).await; + let patch = client + .fetch_patch(common.org.as_deref(), uuid) + .await + .ok()??; + Some(crate::commands::get::record_from_patch_response(&patch)) +} + +/// `pkg:npm/@` → (name, version); the name may be scoped. +fn npm_coords(base_purl: &str) -> Option<(String, String)> { + let rest = strip_purl_qualifiers(base_purl).strip_prefix("pkg:npm/")?; + let (name, version) = rest.rsplit_once('@')?; + if name.is_empty() || version.is_empty() { + return None; + } + Some((name.to_string(), version.to_string())) +} + +/// A more specific unrepairable detail when one is knowable from the entry. +fn fetch_pristine_unrepairable_detail(c: &Candidate) -> Option { + if c.entry.artifact.platform_locked == Some(true) { + Some( + "the vendored wheel is platform-locked (compiled); reinstall the package on \ + this platform and re-run repair, or run `socket-patch vendor` to rebuild it" + .to_string(), + ) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// pnpm writes vendored paths in THREE spellings — override values, + /// `tarball:` fields, and snapshot KEYS with a trailing colon. 
The + /// scanner must yield the clean relpath whichever form it meets first. + #[tokio::test] + async fn scan_handles_pnpm_snapshot_key_colons() { + let tmp = tempfile::tempdir().unwrap(); + let uuid = "11111111-1111-4111-8111-111111111111"; + let lock = format!( + "overrides:\n left-pad@1.3.0: file:.socket/vendor/npm/{uuid}/left-pad-1.3.0.tgz\n\n\ + snapshots:\n\n left-pad@file:.socket/vendor/npm/{uuid}/left-pad-1.3.0.tgz:\n {{}}\n" + ); + tokio::fs::write(tmp.path().join("pnpm-lock.yaml"), &lock) + .await + .unwrap(); + let refs = scan_vendor_references(tmp.path()).await; + assert_eq!(refs.len(), 1, "{refs:?}"); + assert_eq!( + refs[0].2, + format!(".socket/vendor/npm/{uuid}/left-pad-1.3.0.tgz"), + "no trailing colon: {refs:?}" + ); + + // Snapshot-key-only lock (the key form is the FIRST occurrence). + let lock = format!( + "snapshots:\n\n left-pad@file:.socket/vendor/npm/{uuid}/left-pad-1.3.0.tgz:\n {{}}\n" + ); + tokio::fs::write(tmp.path().join("pnpm-lock.yaml"), &lock) + .await + .unwrap(); + let refs = scan_vendor_references(tmp.path()).await; + assert_eq!(refs.len(), 1, "{refs:?}"); + assert!( + refs[0].2.ends_with("left-pad-1.3.0.tgz"), + "trailing colon must be cut: {refs:?}" + ); + } +} diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index f395763..7d75263 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -10,7 +10,7 @@ use socket_patch_core::patch::apply_lock; use socket_patch_core::utils::cleanup_blobs::{ cleanup_unused_archives, cleanup_unused_blobs, CleanupResult, }; -use socket_patch_core::utils::purl::strip_purl_qualifiers; +use socket_patch_core::utils::purl::{normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{ track_patch_scan_failed, track_patch_scanned, track_patch_vendor_failed, track_patch_vendored, }; @@ -19,7 +19,7 @@ use std::path::Path; use std::time::Duration; use 
crate::args::{apply_env_toggles, GlobalArgs}; -use crate::commands::fetch_stage::{stage_patch_sources, StageOutcome}; +use crate::commands::fetch_stage::{stage_vendor_sources_in_memory, MemStageOutcome}; use crate::commands::vex::{generate_vex_from_manifest_path, VexEmbedArgs}; use crate::ecosystem_dispatch::crawl_all_ecosystems; use crate::json_envelope::{Command as EnvelopeCommand, Envelope}; @@ -54,6 +54,12 @@ pub(crate) struct GcSummary { pub blobs: CleanupResult, pub diffs: CleanupResult, pub packages: CleanupResult, + /// Vendored entries reverted (or revertable, preview mode) because + /// their patch is gone from the manifest or their dependency left the + /// lockfile graph — see `vendor::run_vendor_gc`. Sorted. + pub vendored_reverted: Vec, + /// Orphan `.socket/vendor//` dirs swept (or sweepable). + pub vendor_orphan_dirs: usize, /// `true` when `--no-prune` was set; the sub-object only carries the /// `skipped: true` field in that case. pub skipped: bool, @@ -64,6 +70,17 @@ impl GcSummary { self.blobs.bytes_freed + self.diffs.bytes_freed + self.packages.bytes_freed } + /// Fold a vendored-state GC pass into this summary. + fn absorb_vendor_gc(&mut self, v: super::vendor::VendorGcSummary) { + self.vendored_reverted = v + .dropped_reverted + .into_iter() + .chain(v.unused_reverted) + .collect(); + self.vendored_reverted.sort(); + self.vendor_orphan_dirs = v.orphan_dirs; + } + /// Serialize for a *mutating* GC pass (post-apply). 
fn to_apply_json(&self) -> serde_json::Value { if self.skipped { @@ -74,6 +91,8 @@ impl GcSummary { "removedBlobs": self.blobs.blobs_removed, "removedDiffArchives": self.diffs.blobs_removed, "removedPackageArchives": self.packages.blobs_removed, + "revertedVendoredEntries": self.vendored_reverted, + "removedVendorOrphanDirs": self.vendor_orphan_dirs, "bytesFreed": self.total_bytes(), }) } @@ -88,6 +107,8 @@ impl GcSummary { "orphanBlobs": self.blobs.blobs_removed, "orphanDiffArchives": self.diffs.blobs_removed, "orphanPackageArchives": self.packages.blobs_removed, + "revertableVendoredEntries": self.vendored_reverted, + "vendorOrphanDirs": self.vendor_orphan_dirs, "bytesReclaimable": self.total_bytes(), }) } @@ -118,6 +139,7 @@ async fn run_gc( diffs, packages, skipped: false, + ..Default::default() } } @@ -127,16 +149,28 @@ async fn run_gc( /// `prune` flag — when GC isn't requested, simply don't call this function and /// don't emit a `gc` sub-object. async fn run_apply_gc( + common: &crate::args::GlobalArgs, manifest_path: &Path, socket_dir: &Path, scanned_purls: &HashSet, vendored: &HashSet, ) -> GcSummary { + // Vendored-state GC FIRST: it reverts manifest-dropped and + // lockfile-unused vendored entries, dropping the latter's manifest + // entries — so the manifest prune + blob sweep below reclaims their + // blobs in this same pass (and the stale `vendored` exemption set is + // harmless: the entries it would exempt are already gone). + let vendor_gc = super::vendor::run_vendor_gc(common, manifest_path, /*dry_run=*/ false).await; + // Re-read the just-written manifest (the apply step may have added // or updated entries we now want to consider for pruning). 
let mut manifest = match read_manifest(manifest_path).await { Ok(Some(m)) => m, - _ => return GcSummary::default(), + _ => { + let mut gc = GcSummary::default(); + gc.absorb_vendor_gc(vendor_gc); + return gc; + } }; let prunable = detect_prunable(&manifest, scanned_purls, vendored); for purl in &prunable { @@ -147,22 +181,42 @@ async fn run_apply_gc( // file-level cleanup below still operates on the in-memory copy. let _ = write_manifest(manifest_path, &manifest).await; } - run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ false).await + let mut gc = run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ false).await; + gc.absorb_vendor_gc(vendor_gc); + gc } /// Dry-run preview of the apply-mode GC pass. Same shape as /// [`run_apply_gc`] but emits `prunable*`/`orphan*` field names and /// performs no mutation. async fn preview_apply_gc( + common: &crate::args::GlobalArgs, manifest_path: &Path, socket_dir: &Path, scanned_purls: &HashSet, vendored: &HashSet, ) -> GcSummary { + // Read-only preview of the vendored-state GC (lists, never reverts). + let vendor_gc = super::vendor::run_vendor_gc(common, manifest_path, /*dry_run=*/ true).await; + let mut manifest = match read_manifest(manifest_path).await { Ok(Some(m)) => m, - _ => return GcSummary::default(), + _ => { + let mut gc = GcSummary::default(); + gc.absorb_vendor_gc(vendor_gc); + return gc; + } }; + // Mirror the wet pass: an unused vendored entry's manifest keys are + // dropped before the blob sweep, so drop them from the in-memory copy + // too — otherwise the preview under-reports orphan blobs/bytes + // relative to what the real `--prune` run frees. 
+ for purl in &vendor_gc.unused_reverted { + let base = strip_purl_qualifiers(purl).to_string(); + manifest + .patches + .retain(|k, _| k != purl && strip_purl_qualifiers(k) != base); + } let prunable = detect_prunable(&manifest, scanned_purls, vendored); // Mirror `run_apply_gc`: drop the prunable entries from the manifest // *before* computing orphans (no write — this is the preview). The @@ -174,7 +228,9 @@ async fn preview_apply_gc( for purl in &prunable { manifest.patches.remove(purl); } - run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ true).await + let mut gc = run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ true).await; + gc.absorb_vendor_gc(vendor_gc); + gc } /// PURL strings present in the manifest but absent from `scanned_purls`. @@ -197,28 +253,196 @@ async fn preview_apply_gc( /// copy is its NORMAL state, not "no longer installed". Without this, a /// wiped node_modules would prune the manifest entry — and the next /// `vendor` run would then reconcile-revert the vendoring itself. +/// +/// Both sides are compared in percent-DECODED form (`normalize_purl`): +/// manifest keys come from the API encoded (`pkg:npm/%40scope/x@1`) while +/// crawler purls carry the literal `@scope` — comparing the raw strings +/// would make every encoded scoped entry look prunable and `--prune`/ +/// `--sync` would GC the very patch it just downloaded. 
pub(crate) fn detect_prunable( manifest: &PatchManifest, scanned_purls: &HashSet, vendored: &HashSet, ) -> Vec { - let scanned_bases: HashSet<&str> = scanned_purls + let scanned_bases: HashSet = scanned_purls .iter() - .map(|p| strip_purl_qualifiers(p)) + .map(|p| normalize_purl(strip_purl_qualifiers(p)).into_owned()) .collect(); manifest .patches .keys() .filter(|p| { - let base = strip_purl_qualifiers(p); - !scanned_bases.contains(base) + let base = normalize_purl(strip_purl_qualifiers(p)); + !scanned_bases.contains(base.as_ref()) && !vendored.contains(p.as_str()) - && !vendored.contains(base) + && !vendored.contains(strip_purl_qualifiers(p)) }) .cloned() .collect() } +/// Lockfile-only packages: dependencies the project's lockfile resolves +/// that have no crawled (installed) counterpart. +#[derive(Default)] +struct LockfileSupplement { + packages: Vec, + /// Literal crawler-form purls, for fast membership tests. + purls: HashSet, + /// The lockfile the entries came from, for messages. + source: &'static str, +} + +/// Inventory the project's lockfile(s) and fabricate crawl entries for +/// dependencies that are not installed. The fabricated `path` is the +/// WOULD-BE install dir — every consumer degrades safely on a nonexistent +/// path (hash verify → NotFound, apply → partitioned skip, vendor → +/// auto-fetch). Global scans target the machine's global tree, not this +/// project's lockfile, so they get no supplement. 
+async fn lockfile_supplement( + common: &GlobalArgs, + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> LockfileSupplement { + use socket_patch_core::patch::vendor::lock_inventory; + + let mut out = LockfileSupplement { + source: "project lockfiles", + ..Default::default() + }; + if common.global || common.global_prefix.is_some() { + return out; + } + let entries = lock_inventory::inventory_project(&common.cwd).await; + if entries.is_empty() { + return out; + } + let crawled_purls: HashSet<&str> = crawled.iter().map(|p| p.purl.as_str()).collect(); + for entry in entries { + if crawled_purls.contains(entry.purl.as_str()) { + continue; + } + let Some(pkg) = crawled_from_purl(&entry.purl, &common.cwd) else { + continue; + }; + out.purls.insert(entry.purl.clone()); + out.packages.push(pkg); + } + out +} + +/// A displayable crawl entry fabricated from a purl (decoded form). The +/// path is a placeholder consumers degrade safely on. +fn crawled_from_purl( + purl: &str, + cwd: &std::path::Path, +) -> Option { + let decoded = normalize_purl(strip_purl_qualifiers(purl)).into_owned(); + let rest = decoded.strip_prefix("pkg:")?; + let (_eco, rest) = rest.split_once('/')?; + let at = rest.rfind('@').filter(|&i| i > 0)?; + let (name_part, version) = (&rest[..at], &rest[at + 1..]); + let (namespace, name) = match name_part.rsplit_once('/') { + Some((ns, n)) => (Some(ns.to_string()), n.to_string()), + None => (None, name_part.to_string()), + }; + Some(socket_patch_core::crawlers::types::CrawledPackage { + name, + version: version.to_string(), + namespace, + purl: decoded.clone(), + path: cwd.join("node_modules").join(name_part), + }) +} + +/// Vendored-ledger packages with no crawled counterpart: on a fresh clone +/// the committed artifact IS the dependency, so these stay discoverable +/// (updates[] detection, the table, and `scan --vendor` re-vendor/in-sync +/// runs all keep working before any install). 
They are NOT "lockfile-only" +/// — nothing needs installing; the artifact satisfies the lock. +async fn vendored_ledger_supplement( + common: &GlobalArgs, + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> Vec { + if common.global || common.global_prefix.is_some() { + return Vec::new(); + } + let Ok(state) = socket_patch_core::patch::vendor::load_state(&common.cwd).await else { + return Vec::new(); + }; + let crawled_norm: HashSet = crawled + .iter() + .map(|p| normalize_purl(&p.purl).into_owned()) + .collect(); + let mut seen: HashSet = HashSet::new(); + let mut out = Vec::new(); + for entry in state.entries.values() { + let base = strip_purl_qualifiers(&entry.base_purl); + let norm = normalize_purl(base).into_owned(); + if crawled_norm.contains(&norm) || !seen.insert(norm) { + continue; + } + if let Some(pkg) = crawled_from_purl(base, &common.cwd) { + out.push(pkg); + } + } + out.sort_by(|a, b| a.purl.cmp(&b.purl)); + out +} + +/// Vendor-mode pre-prompt check: uuids of selected patches whose installed +/// files match NEITHER beforeHash nor afterHash — the patch was built +/// against different bytes than the installed artifact. Vendoring still +/// succeeds for these (the vendor stage force-applies the verified patched +/// content; see `force_apply_staged`), but the user should learn it BEFORE +/// the confirm prompt, not from a post-hoc warning event. +/// +/// Best-effort and read-only: a detail-fetch failure or an unresolvable +/// installed path just skips the annotation — it never blocks the flow and +/// writes nothing (unlike `download_patch_records`, which stages blobs). 
+async fn preverify_vendor_baselines( + api_client: &socket_patch_core::api::client::ApiClient, + org_slug: Option<&str>, + selected: &[PatchSearchResult], + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], + lockfile_only: &HashSet, +) -> HashSet { + use socket_patch_core::manifest::schema::PatchFileInfo; + use socket_patch_core::patch::apply::{verify_file_patch, VerifyStatus}; + use socket_patch_core::utils::purl::purl_eq; + + let mut mismatched: HashSet = HashSet::new(); + for patch in selected { + // API purls come percent-encoded, crawler purls literal — purl_eq + // bridges the two spellings. + let base = strip_purl_qualifiers(&patch.purl); + // Lockfile-only packages have no installed bytes to compare — the + // vendor engine fetches them pristine (nothing to annotate). + if lockfile_only.contains(normalize_purl(base).as_ref()) { + continue; + } + let Some(pkg) = crawled.iter().find(|c| purl_eq(&c.purl, base)) else { + continue; + }; + let Ok(Some(detail)) = api_client.fetch_patch(org_slug, &patch.uuid).await else { + continue; + }; + for (file, info) in &detail.files { + let info = PatchFileInfo { + before_hash: info.before_hash.clone().unwrap_or_default(), + after_hash: info.after_hash.clone().unwrap_or_default(), + }; + if info.before_hash.is_empty() { + continue; // a new file has no baseline to compare + } + if verify_file_patch(&pkg.path, file, &info).await.status == VerifyStatus::HashMismatch + { + mismatched.insert(patch.uuid.clone()); + break; + } + } + } + mismatched +} + /// Cross-reference an existing manifest against discovery results to find /// PURLs whose newest available patch UUID differs from the locally-recorded /// one. Used by both the discovery JSON path and the table-print path. 
@@ -509,16 +733,18 @@ async fn run_scan_vendor_step( patches: records.clone(), setup: None, }; - let staged = match stage_patch_sources(common, &synth, socket_dir).await { - Ok(StageOutcome::Ready(s)) => s, - Ok(StageOutcome::Unavailable) => { - return Err(( - "no_local_source", - "patch artifacts unavailable (offline or download failure)".to_string(), - )) - } - Err(e) => return Err(("stage_failed", e)), - }; + let staged = + match stage_vendor_sources_in_memory(common, &synth, socket_dir, &common.cwd).await + { + Ok(MemStageOutcome::Ready(s)) => s, + Ok(MemStageOutcome::Unavailable) => { + return Err(( + "no_local_source", + "patch artifacts unavailable (offline or download failure)".to_string(), + )) + } + Err(e) => return Err(("stage_failed", e)), + }; let sources = staged.as_patch_sources(); boxed_vendor_records(common, records, &sources, true, &mut env).await } @@ -536,16 +762,19 @@ async fn run_scan_vendor_step( // Same placement as the `vendor` command: dropped entries // are reverted even when zero in-scope patches remain. 
let mut has_errors = reconcile_dropped(&manifest, common, &mut env).await; - let staged = match stage_patch_sources(common, &manifest, socket_dir).await { - Ok(StageOutcome::Ready(s)) => s, - Ok(StageOutcome::Unavailable) => { - return Err(( - "no_local_source", - "patch artifacts unavailable (offline or download failure)".to_string(), - )) - } - Err(e) => return Err(("stage_failed", e)), - }; + let staged = + match stage_vendor_sources_in_memory(common, &manifest, socket_dir, &common.cwd) + .await + { + Ok(MemStageOutcome::Ready(s)) => s, + Ok(MemStageOutcome::Unavailable) => { + return Err(( + "no_local_source", + "patch artifacts unavailable (offline or download failure)".to_string(), + )) + } + Err(e) => return Err(("stage_failed", e)), + }; let sources = staged.as_patch_sources(); has_errors |= boxed_vendor_records(common, &manifest.patches, &sources, false, &mut env).await; @@ -612,8 +841,14 @@ async fn run_vendor_json_path( // and preview the GC, exactly like `--apply`'s dry run. result["vendor"] = preview_vendor_json(&args.common.cwd, &selected).await; if prune { - let gc = - preview_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = preview_apply_gc( + &args.common, + manifest_path, + socket_dir, + scanned_purls, + vendored_purls, + ) + .await; result["gc"] = gc.to_preview_json(); } let final_code = @@ -639,6 +874,8 @@ async fn run_vendor_json_path( download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, + persist_blobs: !args.vendor, }; let mut has_errors = false; let detached_records: Option> = if args.detached { @@ -674,7 +911,14 @@ async fn run_vendor_json_path( // package_not_installed; vendored entries are exempt from // the prune itself. 
if prune { - let gc = run_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = run_apply_gc( + &args.common, + manifest_path, + socket_dir, + scanned_purls, + vendored_purls, + ) + .await; result["gc"] = gc.to_apply_json(); } @@ -760,7 +1004,14 @@ async fn run_vendor_interactive_path( // GC before the vendor step (see the JSON path): stale manifest // entries would fail vendoring with package_not_installed. if prune { - let gc = run_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = run_apply_gc( + &args.common, + manifest_path, + socket_dir, + scanned_purls, + vendored_purls, + ) + .await; if !gc.pruned.is_empty() { println!("GC: pruned {} manifest entr{}.", gc.pruned.len(), { if gc.pruned.len() == 1 { @@ -770,6 +1021,19 @@ async fn run_vendor_interactive_path( } }); } + if !gc.vendored_reverted.is_empty() || gc.vendor_orphan_dirs > 0 { + println!( + "GC: reverted {} vendored entr{}; swept {} orphan vendor dir{}.", + gc.vendored_reverted.len(), + if gc.vendored_reverted.len() == 1 { + "y" + } else { + "ies" + }, + gc.vendor_orphan_dirs, + if gc.vendor_orphan_dirs == 1 { "" } else { "s" }, + ); + } } match boxed_scan_vendor_step( &args.common, @@ -830,6 +1094,38 @@ fn partition_vendored_selected( (kept, vendored_records) } +/// Lockfile-only patches are skipped BEFORE download in apply mode: the +/// package is not on disk to patch in place, and downloading its patch +/// into the manifest would create a not-yet-appliable entry (and flip the +/// apply path's exit code). `scan --vendor` is the route that handles them +/// (the vendor engine auto-fetches lockfile-resolved packages). Matching +/// bridges API purl encoding via `normalize_purl`. Same shape/mechanics as +/// [`partition_vendored_selected`]. 
+fn partition_not_installed_selected( + selected: Vec, + lockfile_only: &HashSet, +) -> (Vec, Vec) { + if lockfile_only.is_empty() { + return (selected, Vec::new()); + } + let is_lockfile_only = + |p: &str| lockfile_only.contains(normalize_purl(strip_purl_qualifiers(p)).as_ref()); + let (not_installed, kept): (Vec<_>, Vec<_>) = selected + .into_iter() + .partition(|p| is_lockfile_only(&p.purl)); + let mut records: Vec = not_installed + .iter() + .map(|p| { + serde_json::json!({ + "purl": p.purl, "uuid": p.uuid, + "action": "skipped", "errorCode": "package_not_installed", + }) + }) + .collect(); + records.sort_by(|a, b| a["purl"].as_str().cmp(&b["purl"].as_str())); + (kept, records) +} + /// Fold the pre-download vendored skips into the apply report returned by /// `download_and_apply_patches`: they were "found" by discovery and /// skipped here, never downloaded. Also strips the inner `status` (scan @@ -1063,7 +1359,28 @@ pub async fn run(args: ScanArgs) -> i32 { } // Crawl packages - let (all_crawled, eco_counts) = crawl_all_ecosystems(&crawler_options).await; + let (mut all_crawled, mut eco_counts) = crawl_all_ecosystems(&crawler_options).await; + + // Lockfile supplement: dependencies the project's lockfile resolves + // that have NO installed copy (fresh clone, partial install). They join + // discovery — counts, API lookup, table, the prune "scanned" set — and + // are flagged "not yet installed" everywhere a user could act on them. 
+ let lockfile_only = lockfile_supplement(&args.common, &all_crawled).await; + if !lockfile_only.packages.is_empty() { + for pkg in &lockfile_only.packages { + if let Some(eco) = Ecosystem::from_purl(&pkg.purl) { + *eco_counts.entry(eco).or_insert(0) += 1; + } + } + all_crawled.extend(lockfile_only.packages.iter().cloned()); + } + let ledger_supplement = vendored_ledger_supplement(&args.common, &all_crawled).await; + for pkg in &ledger_supplement { + if let Some(eco) = Ecosystem::from_purl(&pkg.purl) { + *eco_counts.entry(eco).or_insert(0) += 1; + } + } + all_crawled.extend(ledger_supplement); // Every PURL the crawl found, captured BEFORE the `--ecosystems` // display/query filter is applied. Prune (below) must reference the @@ -1072,6 +1389,9 @@ pub async fn run(args: ScanArgs) -> i32 { // prune used the filtered set instead, `scan --ecosystems npm --prune` // would treat every cargo/go/pypi/gem manifest entry as "uninstalled" // and delete it (plus its blobs) — silent cross-ecosystem data loss. + // Lockfile-only purls are deliberately included: a dependency the + // lockfile still resolves must not be pruned just because node_modules + // is wiped or partially installed. 
let installed_purls: HashSet = all_crawled.iter().map(|p| p.purl.clone()).collect(); // Vendor-ledger purl keys, loaded once and shared by the prune @@ -1129,6 +1449,7 @@ pub async fn run(args: ScanArgs) -> i32 { let mut result = serde_json::json!({ "status": "success", "scannedPackages": 0, + "lockfileOnlyPackages": 0, "packagesWithPatches": 0, "totalPatches": 0, "freePatches": 0, @@ -1189,6 +1510,13 @@ pub async fn run(args: ScanArgs) -> i32 { } else { eprintln!("Found {package_count} packages{eco_summary}"); } + if !lockfile_only.purls.is_empty() { + eprintln!( + "Note: {} package(s) from {} are not yet installed (lockfile-only).", + lockfile_only.purls.len(), + lockfile_only.source, + ); + } } // Query API in batches @@ -1374,6 +1702,7 @@ pub async fn run(args: ScanArgs) -> i32 { let mut result = serde_json::json!({ "status": "success", "scannedPackages": package_count, + "lockfileOnlyPackages": lockfile_only.purls.len(), "packagesWithPatches": all_packages_with_patches.len(), "totalPatches": total_patches, "freePatches": free_patches, @@ -1386,6 +1715,19 @@ pub async fn run(args: ScanArgs) -> i32 { "newUuid": u.new_uuid, })).collect::>(), }); + // Flag lockfile-only packages so JSON consumers can tell "patch + // available but not installed" from the installed case. Additive + // field; absent means installed. + if let Some(packages) = result["packages"].as_array_mut() { + for pkg in packages { + let is_lockfile_only = pkg["purl"] + .as_str() + .is_some_and(|p| lockfile_only.purls.contains(p)); + if is_lockfile_only { + pkg["notInstalled"] = serde_json::json!(true); + } + } + } // `apply` and `prune` are computed once at the top of run() // (factoring in --sync, which implies both). They're independent @@ -1428,6 +1770,17 @@ pub async fn run(args: ScanArgs) -> i32 { // operator's signal to run `scan --vendor` (or `vendor`). 
let (selected, vendored_records) = partition_vendored_selected(selected, &vendored_purls); + // Lockfile-only purls leave the apply selection here (calm + // skip records, never an error); the union rides the same + // bookkeeping as the vendored skips. + let (selected, vendored_records) = { + let (kept, not_installed) = + partition_not_installed_selected(selected, &lockfile_only.purls); + let mut all = vendored_records; + all.extend(not_installed); + all.sort_by(|a, b| a["purl"].as_str().cmp(&b["purl"].as_str())); + (kept, all) + }; let mut apply_code = 0i32; if dry { @@ -1497,6 +1850,8 @@ pub async fn run(args: ScanArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, + persist_blobs: !args.vendor, }; let (code, apply_json) = download_and_apply_patches(&selected, &params).await; apply_code = code; @@ -1511,10 +1866,23 @@ pub async fn run(args: ScanArgs) -> i32 { // --- GC (if requested) -------------------------------------- if prune { let gc = if dry { - preview_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls) - .await + preview_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await } else { - run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + run_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await }; result["gc"] = if dry { gc.to_preview_json() @@ -1564,9 +1932,23 @@ pub async fn run(args: ScanArgs) -> i32 { // --- GC-only path (no --apply, just --prune) -------------------- if prune { let gc = if dry { - preview_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + preview_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await } else { - run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, 
&vendored_purls).await + run_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await }; result["gc"] = if dry { gc.to_preview_json() @@ -1670,14 +2052,22 @@ pub async fn run(args: ScanArgs) -> i32 { } else { String::new() }; + // Lockfile-only packages can be patched by `scan --vendor` + // (which fetches them pristine) but not applied in place. + let not_installed_marker = if lockfile_only.purls.contains(pkg.purl.as_str()) { + color(" [NOT INSTALLED]", "33", use_color) + } else { + String::new() + }; println!( - "{:<40} {:>8} {:<16} {}{}", + "{:<40} {:>8} {:<16} {}{}{}", display_purl, count_str, format_severity(severity, use_color), vuln_str, update_marker, + not_installed_marker, ); } @@ -1804,7 +2194,30 @@ pub async fn run(args: ScanArgs) -> i32 { for p in &vendored_selected { println!( " [skip] {} (vendored — run scan --vendor to update)", - p.purl + normalize_purl(&p.purl) + ); + } + } + + // Lockfile-only purls leave the in-place apply selection (calm skip, + // mirrors the JSON path). In `--vendor` mode they stay: the vendor + // engine fetches lockfile-resolved packages pristine. 
+ let (selected, not_installed_selected): (Vec<_>, Vec<String>) = if args.vendor { + (selected, Vec::new()) + } else { + let (kept, skipped) = partition_not_installed_selected(selected, &lockfile_only.purls); + let printed: Vec<String> = skipped + .iter() + .filter_map(|r| r["purl"].as_str().map(str::to_string)) + .collect(); + (kept, printed) + }; + if !args.common.silent { + for purl in &not_installed_selected { + println!( + " [skip] {} (not installed — run your package manager's install first, \ + or `scan --vendor` to vendor it from the lockfile)", + normalize_purl(purl) ); } } @@ -1816,6 +2229,22 @@ pub async fn run(args: ScanArgs) -> i32 { return embed_vex_human(&args.common, &args.vex, &manifest_path, 0).await; } + // Vendor mode: pre-verify baselines so a content mismatch surfaces + // BEFORE the confirm prompt (vendoring still proceeds for these — + // the stage force-applies the verified patched content). + let mismatched_baselines: HashSet<String> = if args.vendor && !args.common.silent { + preverify_vendor_baselines( + &api_client, + effective_org_slug, + &selected, + &filtered_crawled, + &lockfile_only.purls, + ) + .await + } else { + HashSet::new() + }; + // Display detailed summary of selected patches before confirming // (presentational only — skipped wholesale under --silent). if !args.common.silent { @@ -1851,10 +2280,18 @@ pub async fn run(args: ScanArgs) -> i32 { println!( " {} [{}] {}", - patch.purl, + // Human display only: show the decoded form of an + // API-encoded purl (`%40scope` → `@scope`). JSON output + // keeps the verbatim key. 
+ normalize_purl(&patch.purl), patch.tier.to_uppercase(), sev_colored, ); + if mismatched_baselines.contains(&patch.uuid) { + println!( + " (installed content differs from patch baseline — will vendor patched content)" + ); + } if !vuln_ids.is_empty() { println!(" Fixes: {}", vuln_ids.join(", ")); } @@ -1925,6 +2362,8 @@ pub async fn run(args: ScanArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, + persist_blobs: !args.vendor, }; let code = if args.vendor { @@ -1954,7 +2393,14 @@ pub async fn run(args: ScanArgs) -> i32 { // run `socket-patch gc` (or `repair`) explicitly. (Vendor mode // already ran its GC before the vendor step.) if prune && !args.vendor { - let gc = run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await; + let gc = run_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await; let total = gc.blobs.blobs_removed + gc.diffs.blobs_removed + gc.packages.blobs_removed; if !args.common.silent && (!gc.pruned.is_empty() || total > 0) { println!( @@ -1966,6 +2412,19 @@ pub async fn run(args: ScanArgs) -> i32 { socket_patch_core::utils::cleanup_blobs::format_bytes(gc.total_bytes()), ); } + if !args.common.silent && (!gc.vendored_reverted.is_empty() || gc.vendor_orphan_dirs > 0) { + println!( + "GC: reverted {} vendored entr{}; swept {} orphan vendor dir{}.", + gc.vendored_reverted.len(), + if gc.vendored_reverted.len() == 1 { + "y" + } else { + "ies" + }, + gc.vendor_orphan_dirs, + if gc.vendor_orphan_dirs == 1 { "" } else { "s" }, + ); + } } embed_vex_human(&args.common, &args.vex, &manifest_path, code).await @@ -2156,6 +2615,16 @@ mod tests { HashSet::new() } + /// GlobalArgs rooted at the test project dir (the vendored-state GC + /// loads `.socket/vendor/state.json` from `cwd`; these fixtures have + /// none, so the vendor pass is a no-op). 
+ fn gc_common(cwd: &Path) -> crate::args::GlobalArgs { + crate::args::GlobalArgs { + cwd: cwd.to_path_buf(), + ..Default::default() + } + } + #[test] fn detect_prunable_empty_manifest_empty_scanned() { let m = PatchManifest::new(); @@ -2244,6 +2713,23 @@ mod tests { ); } + #[test] + fn detect_prunable_encoded_manifest_key_not_pruned() { + // The API serves scoped purls percent-encoded and they land in the + // manifest verbatim; the crawler reports the literal `@scope` form. + // Comparing raw strings would make every encoded scoped entry look + // prunable — `scan --prune` would GC the patch it just downloaded. + let m = manifest_with(&[("pkg:npm/%40scope/x@1.0.0", "uuid-a")]); + let s = scanned(&["pkg:npm/@scope/x@1.0.0"]); + assert!( + detect_prunable(&m, &s, &no_vendored()).is_empty(), + "encoded manifest key must match the decoded scanned purl" + ); + // A genuinely-gone encoded entry still prunes. + let out = detect_prunable(&m, &scanned(&[]), &no_vendored()); + assert_eq!(out, vec!["pkg:npm/%40scope/x@1.0.0".to_string()]); + } + #[test] fn detect_prunable_exempts_qualified_variant_of_vendored_base() { // The ledger key set carries qualifier-stripped bases (see @@ -2319,7 +2805,14 @@ mod tests { seed_manifest_with_blob(tmp.path(), "pkg:npm/gone@1.0.0", &after_hash); let scanned: HashSet = HashSet::new(); - let preview = preview_apply_gc(&manifest_path, &socket_dir, &scanned, &no_vendored()).await; + let preview = preview_apply_gc( + &gc_common(tmp.path()), + &manifest_path, + &socket_dir, + &scanned, + &no_vendored(), + ) + .await; assert_eq!( preview.pruned, @@ -2356,13 +2849,27 @@ mod tests { let (mp_p, sd_p, blob_p) = seed_manifest_with_blob(tmp_preview.path(), "pkg:npm/gone@1.0.0", &after_hash); let scanned: HashSet = HashSet::new(); - let preview = preview_apply_gc(&mp_p, &sd_p, &scanned, &no_vendored()).await; + let preview = preview_apply_gc( + &gc_common(tmp_preview.path()), + &mp_p, + &sd_p, + &scanned, + &no_vendored(), + ) + .await; 
assert!(blob_p.exists(), "preview must not mutate"); let tmp_wet = tempfile::tempdir().unwrap(); let (mp_w, sd_w, blob_w) = seed_manifest_with_blob(tmp_wet.path(), "pkg:npm/gone@1.0.0", &after_hash); - let wet = run_apply_gc(&mp_w, &sd_w, &scanned, &no_vendored()).await; + let wet = run_apply_gc( + &gc_common(tmp_wet.path()), + &mp_w, + &sd_w, + &scanned, + &no_vendored(), + ) + .await; assert_eq!( preview.blobs.blobs_removed, wet.blobs.blobs_removed, diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index ac7f9c6..dae1815 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -27,7 +27,7 @@ use socket_patch_core::patch::vendor::{ self, ecosystem_dir_for_purl, load_state, save_state, RevertOutcome, VendorEntry, VendorOutcome, VendorWarning, }; -use socket_patch_core::utils::purl::strip_purl_qualifiers; +use socket_patch_core::utils::purl::{normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{track_patch_vendor_failed, track_patch_vendored}; use socket_patch_core::vex::time::now_rfc3339; use std::collections::{HashMap, HashSet}; @@ -36,7 +36,7 @@ use std::time::Duration; use crate::args::{apply_env_toggles, GlobalArgs}; use crate::commands::apply::{result_to_event, variant_matches_installed}; -use crate::commands::fetch_stage::{stage_patch_sources, StageOutcome}; +use crate::commands::fetch_stage::{stage_vendor_sources_in_memory, MemStageOutcome}; use crate::commands::lock_cli::{acquire_or_emit, lock_broken_event}; use crate::commands::vex::{generate_vex_from_manifest_path, VexEmbedArgs}; use crate::ecosystem_dispatch::{find_packages_for_purls, partition_purls}; @@ -49,8 +49,12 @@ pub struct VendorArgs { #[command(flatten)] pub common: GlobalArgs, - /// Skip pre-vendor hash verification (vendor even if the installed - /// package's files differ from the patch's beforeHash). 
+ /// Tolerate MISSING patch-target files in the staged copy (they are + /// skipped instead of failing the vendor) and bypass the variant + /// probe for multi-release ecosystems. A plain beforeHash mismatch + /// no longer needs this: vendor staging always overwrites mismatched + /// content with the verified patched bytes (surfaced as a + /// `vendor_content_mismatch_overwritten` warning). #[arg( short = 'f', long, @@ -78,7 +82,7 @@ pub struct VendorArgs { /// Refusal codes that are expected skips, not command failures: the user's /// request is still fully satisfied when these are the only non-successes. -fn refusal_is_benign(code: &str) -> bool { +pub(crate) fn refusal_is_benign(code: &str) -> bool { matches!(code, "vendor_unsupported_ecosystem" | "already_vendored") } @@ -86,7 +90,7 @@ fn refusal_is_benign(code: &str) -> bool { /// installed location (site-packages root for pypi, the package dir /// otherwise). Returns `None` for purls with no vendor backend in this build. #[allow(clippy::too_many_arguments)] -async fn dispatch_vendor_one( +pub(crate) async fn dispatch_vendor_one( purl: &str, pkg_path: &Path, project_root: &Path, @@ -235,9 +239,67 @@ pub(crate) async fn dispatch_revert_one( } } +/// Is this vendored entry still consumed by its project's lockfile +/// dependency graph? `None` = cannot determine — callers must keep the +/// entry (fail-safe): non-npm ecosystems have no in-use probe yet, and a +/// missing/unreadable lockfile proves nothing. +pub(crate) async fn dispatch_in_use_one(entry: &VendorEntry, project_root: &Path) -> Option { + match entry.ecosystem.as_str() { + "npm" => { + socket_patch_core::patch::vendor::npm_flavor::vendored_entry_in_use(entry, project_root) + .await + } + _ => None, + } +} + +/// Uuid dirs under `.socket/vendor//` with no owning `(eco, uuid)` +/// ledger entry (a hand-edited state file, or artifacts left by an +/// interrupted run). 
The lockfile wiring for these is already gone or +/// owned by a recorded entry, so removal is safe; removed unless +/// `dry_run`. Unparseable dirs are never returned (and never deleted). +/// Returns the orphans so callers can emit events / counts. +pub(crate) async fn sweep_orphan_vendor_dirs( + cwd: &Path, + state: &socket_patch_core::patch::vendor::VendorState, + dry_run: bool, +) -> Vec { + let recorded_units: HashSet<(&str, &str)> = state + .entries + .values() + .map(|e| (e.ecosystem.as_str(), e.uuid.as_str())) + .collect(); + let mut orphans = Vec::new(); + for unit in vendor::path::sweep_vendor_dirs(cwd).await { + if recorded_units.contains(&(unit.eco.as_str(), unit.uuid.as_str())) { + continue; + } + if !dry_run { + let _ = remove_tree(&unit.dir).await; + } + orphans.push(unit); + } + orphans +} + +/// Does `eco` fall inside this run's `--ecosystems` scope? +pub(crate) fn ecosystem_in_scope(common: &GlobalArgs, eco: &str) -> bool { + match common.ecosystems.as_deref() { + None => true, + Some(list) => list.iter().any(|e| { + e.eq_ignore_ascii_case(eco) || (eco == "golang" && e.eq_ignore_ascii_case("go")) + }), + } +} + /// Surface a backend warning: stderr line for humans, a Skipped event with /// the stable code for JSON consumers (Skipped never flips the status). 
-fn record_warning(env: &mut Envelope, purl: &str, warning: &VendorWarning, common: &GlobalArgs) { +pub(crate) fn record_warning( + env: &mut Envelope, + purl: &str, + warning: &VendorWarning, + common: &GlobalArgs, +) { if !common.silent && !common.json { eprintln!("Warning ({}): {}", warning.code, warning.detail); } @@ -373,20 +435,24 @@ async fn run_vendor(args: &VendorArgs, manifest_path: &Path, env: &mut Envelope) let mut has_errors = reconcile_dropped(&manifest, common, env).await; let socket_dir = manifest_path.parent().unwrap_or(Path::new(".")); - let staged = match stage_patch_sources(common, &manifest, socket_dir).await { - Ok(StageOutcome::Ready(s)) => s, - Ok(StageOutcome::Unavailable) => { - env.mark_error(EnvelopeError::new( - "no_local_source", - "patch artifacts unavailable (offline or download failure)", - )); - return 1; - } - Err(e) => { - env.mark_error(EnvelopeError::new("stage_failed", e)); - return 1; - } - }; + // Vendor stages patch content IN MEMORY: existing .socket artifacts are + // read in place, missing content is fetched per patch — vendoring never + // writes blobs or temp files (the committed artifact is the patch). + let staged = + match stage_vendor_sources_in_memory(common, &manifest, socket_dir, &common.cwd).await { + Ok(MemStageOutcome::Ready(s)) => s, + Ok(MemStageOutcome::Unavailable) => { + env.mark_error(EnvelopeError::new( + "no_local_source", + "patch artifacts unavailable (offline or download failure)", + )); + return 1; + } + Err(e) => { + env.mark_error(EnvelopeError::new("stage_failed", e)); + return 1; + } + }; let sources = staged.as_patch_sources(); has_errors |= vendor_records(common, &manifest.patches, &sources, false, args.force, env).await; @@ -408,6 +474,140 @@ async fn run_vendor(args: &VendorArgs, manifest_path: &Path, env: &mut Envelope) /// /// Does NOT lock, read the manifest, or print the envelope — callers own all /// three. Returns whether any non-benign failure occurred. 
+/// Persist one backend-returned ledger entry: detached flagging, wiring +/// `original` carry-forward from the entry being replaced, per-package save +/// (crash-consistent with what is already wired), and the stale-uuid-dir +/// sweep on re-vendors. Returns `true` when the save failed (has_errors). +#[allow(clippy::too_many_arguments)] +pub(crate) async fn persist_vendor_entry( + common: &GlobalArgs, + env: &mut Envelope, + state: &mut socket_patch_core::patch::vendor::VendorState, + candidate: &str, + mut entry: socket_patch_core::patch::vendor::VendorEntry, + detached: bool, + record: &PatchRecord, +) -> bool { + let mut has_errors = false; + let candidate = candidate.to_string(); + entry.detached = detached; + entry.record = detached.then(|| record.clone()); + // A re-vendor run re-derives the entry from current + // disk state, where the takeover already happened — + // preserve the prior flag or the revert-time + // "takeover_not_restored" hint is lost. + let prev = state.entries.get(&candidate).cloned(); + if let Some(prev) = &prev { + entry.took_over_go_patches = entry.took_over_go_patches || prev.took_over_go_patches; + // A re-vendor (new patch uuid) rewrites our own + // stale wiring, so the backend records + // `original: None` (it must never record a + // dangling `.socket/vendor/` pointer as the + // pre-vendor fragment). The TRUE pre-vendor + // original lives in the entry being replaced — + // carry it forward by wiring identity, or a + // later `--revert` can only shrug + // (`vendor_lock_entry_drifted`) instead of + // restoring the registry fragment. 
+ for rec in &mut entry.wiring { + if rec.action == socket_patch_core::patch::vendor::state::WiringAction::Rewritten + && rec.original.is_none() + { + if let Some(prev_rec) = prev + .wiring + .iter() + .find(|p| p.file == rec.file && p.kind == rec.kind && p.key == rec.key) + { + rec.original = prev_rec.original.clone(); + } + } + } + } + let new_uuid = entry.uuid.clone(); + state.entries.insert(candidate.clone(), entry); + // Persist per-package so a crash mid-run leaves a + // ledger that matches what's already wired. + if let Err(e) = save_state(&common.cwd, state).await { + has_errors = true; + env.record( + PatchEvent::new(PatchAction::Failed, candidate.clone()) + .with_error("vendor_state_write_failed", e.to_string()), + ); + } else if let Some(prev) = prev.filter(|p| p.uuid != new_uuid) { + // Re-vendor under a newer patch uuid: the old + // uuid's dir is an orphan now — the wiring and + // ledger both point at the new uuid — unless + // another entry still shares it (the same + // `(eco, uuid)` ownership test as `--revert`'s + // orphan sweep). Only the live entry would + // otherwise reclaim it, and that never happens. 
+ let still_referenced = state + .entries + .values() + .any(|e| e.ecosystem == prev.ecosystem && e.uuid == prev.uuid); + let stale_rel = vendor::path::vendor_uuid_dir_rel(&prev.ecosystem, &prev.uuid); + if let Some(rel) = stale_rel.filter(|_| !still_referenced) { + if !common.dry_run { + let _ = remove_tree(&common.cwd.join(rel)).await; + } + env.record( + PatchEvent::new(PatchAction::Removed, candidate.clone()).with_reason( + "vendor_stale_artifact_removed", + "previous patch uuid's vendored artifact removed", + ), + ); + } + } + has_errors +} + +/// One registry-fetch attempt through the pristine-source ladder's network +/// half: the lockfile inventory first, then the ledger-recovered pre-vendor +/// registry fragment (the live lockfile is rewired to `.socket/vendor/...` +/// for vendored packages, so only `--revert`'s restore data still knows the +/// registry resolution). Always integrity-verified fail-closed. +pub(crate) enum PristineFetch { + Fetched(socket_patch_core::patch::vendor::registry_fetch::FetchedPackage), + /// Neither the lockfile nor the ledger can name a verifiable source. 
+ NoSource, + Unverifiable(String), + Failed(String), +} + +pub(crate) async fn fetch_pristine_package( + project_root: &Path, + inventory: &[socket_patch_core::patch::vendor::lock_inventory::LockfileEntry], + client: &socket_patch_core::patch::vendor::registry_fetch::RegistryClient, + purl: &str, + ledger_entry: Option<&socket_patch_core::patch::vendor::VendorEntry>, +) -> PristineFetch { + use socket_patch_core::patch::vendor::{lock_inventory, registry_fetch}; + + let entry = match lock_inventory::lookup(inventory, purl) { + Some(e) => e.clone(), + None => { + let Some(le) = ledger_entry else { + return PristineFetch::NoSource; + }; + match lock_inventory::recover_lock_entry(project_root, le).await { + Ok(rec) => rec, + Err(e) => { + return PristineFetch::Unverifiable(format!( + "the lockfile no longer records a registry resolution for {purl} \ + (rewired to the vendored artifact) and the ledger cannot recover \ + one: {e}" + )) + } + } + } + }; + match registry_fetch::fetch_and_stage(&entry, client).await { + Ok(fetched) => PristineFetch::Fetched(fetched), + Err(registry_fetch::FetchError::Unverifiable(d)) => PristineFetch::Unverifiable(d), + Err(registry_fetch::FetchError::Failed(d)) => PristineFetch::Failed(d), + } +} + pub(crate) async fn vendor_records( common: &GlobalArgs, records: &HashMap, @@ -465,13 +665,162 @@ pub(crate) async fn vendor_records( global_prefix: common.global_prefix.clone(), batch_size: 100, }; - let all_packages = find_packages_for_purls( + let mut all_packages = find_packages_for_purls( &vendorable_partition, &crawler_options, common.silent || common.json, ) .await; + // ── Auto-fetch: lockfile-resolved packages with no installed copy ──── + // A manifest patch whose package is not on disk but IS resolvable from + // the project's lockfile is fetched pristine from its registry (lock- + // recorded URL else the conventional one), verified against the lock's + // integrity FAIL-CLOSED, and staged from a private tempdir — the + // project 
tree is never touched, and the lock wiring works without an + // installed copy (it keys off lock entries). The holders keep the + // tempdirs alive until the dispatch loop below has staged from them. + let mut fetched_holders: Vec = + Vec::new(); + // Fetch failures must keep their distinct Failed event; this set + // suppresses the later duplicate `package_not_installed` skip. + let mut fetch_failed: HashSet = HashSet::new(); + { + use socket_patch_core::patch::vendor::{lock_inventory, registry_fetch}; + let missing: Vec = vendorable + .iter() + .filter(|p| !all_packages.contains_key(*p)) + .cloned() + .collect(); + if !missing.is_empty() { + // The inventory is a local file read — fine offline; only the + // fetch itself needs the network. + let inventory = lock_inventory::inventory_project(&common.cwd).await; + let client = registry_fetch::build_registry_client(); + // Pre-loaded vendor ledger for the artifact-staging path: an + // already-vendored purl with no installed copy (fresh clone) + // stages from its own committed artifact, sha256-verified + // against the ledger — offline-safe, no registry traffic. + let ledger = load_state(&common.cwd).await.unwrap_or_default(); + for purl in &missing { + let ledger_entry = ledger + .entries + .get(purl) + .or_else(|| ledger.entries.values().find(|e| &e.base_purl == purl)); + if let Some(entry) = ledger_entry + .filter(|e| e.ecosystem == "npm" && e.artifact.path.ends_with(".tgz")) + { + let tgz = common.cwd.join(&entry.artifact.path); + if tokio::fs::metadata(&tgz).await.is_err() { + // The committed artifact is GONE (gitignored or + // deleted): not corruption — fall through to the + // registry ladder, which recovers the pre-vendor + // resolution from the ledger and rebuilds. 
+ record_warning( + env, + purl, + &VendorWarning::new( + "vendor_artifact_missing", + format!( + "the committed vendored artifact {} is missing; \ + recovering the registry resolution to rebuild it", + entry.artifact.path + ), + ), + common, + ); + } else { + match registry_fetch::stage_local_artifact(&tgz, &entry.artifact.sha256) + .await + { + Ok(staged) => { + all_packages.insert(purl.clone(), staged.dir().to_path_buf()); + fetched_holders.push(staged); + continue; + } + Err(registry_fetch::FetchError::Failed(detail)) => { + // A PRESENT-but-corrupt committed artifact is + // worth a loud failure — silently re-vendoring + // over it would mask the corruption. + fetch_failed.insert(purl.clone()); + let detail = format!( + "{detail}; run `socket-patch repair` to rebuild the \ + vendored artifact" + ); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!("Cannot vendor {}: {detail}", normalize_purl(purl)); + } + continue; + } + Err(registry_fetch::FetchError::Unverifiable(_)) => { + // No recorded hash (legacy ledger) — fall + // through to the lockfile/registry path. + } + } + } + } + if common.offline { + // The enriched skip detail lands below in the unmatched + // pass (the purl stays unmatched). 
+ continue; + } + match fetch_pristine_package(&common.cwd, &inventory, &client, purl, ledger_entry) + .await + { + PristineFetch::Fetched(fetched) => { + record_warning( + env, + purl, + &VendorWarning::new( + "vendor_fetched_missing", + format!( + "{} is not installed; fetched the pristine artifact \ + from {} (integrity verified) and vendored from that \ + copy — the project tree was not touched", + normalize_purl(purl), + fetched.url + ), + ), + common, + ); + all_packages.insert(purl.clone(), fetched.dir().to_path_buf()); + fetched_holders.push(fetched); + } + PristineFetch::NoSource => { + // Plain not-installed package → the calm + // package_not_installed skip below. + } + PristineFetch::Unverifiable(detail) => { + record_warning( + env, + purl, + &VendorWarning::new("vendor_fetch_unverifiable", detail), + common, + ); + // Falls through to package_not_installed below. + } + PristineFetch::Failed(detail) => { + fetch_failed.insert(purl.clone()); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!( + "Cannot vendor {}: fetch failed: {detail}", + normalize_purl(purl) + ); + } + } + } + } + } + } + let vendored_at = now_rfc3339(); let mut state = match load_state(&common.cwd).await { Ok(s) => s, @@ -566,7 +915,7 @@ pub(crate) async fn vendor_records( ); } if !common.silent && !common.json { - eprintln!("Cannot vendor {candidate}: {detail}"); + eprintln!("Cannot vendor {}: {detail}", normalize_purl(candidate)); } } Some(VendorOutcome::Done { @@ -579,101 +928,55 @@ pub(crate) async fn vendor_records( if !common.silent && !common.json { eprintln!( "Failed to vendor {}: {}", - candidate, + normalize_purl(candidate), result.error.as_deref().unwrap_or("unknown error") ); } } let mut event = result_to_event(&result, common.dry_run); // The shared translator's in-sync classification reads - // `already_patched`; under `vendor` the contract tag is 
- // `already_vendored` (artifact + wiring already in sync). + // `already_patched`. Two distinct cases land there: + // + // * `entry` is None — the TRUE in-sync rerun (the backend + // synthesized AlreadyPatched and recorded nothing); + // under `vendor` the contract tag is `already_vendored`. + // * `entry` is Some — the FIRST vendor of a package + // already patched in place by `apply`: every file + // verified AlreadyPatched, but THIS run packed the + // artifact and rewired the lock. That is an Applied + // (`summary.applied` must count it), not a skip. if event.action == PatchAction::Skipped && event.error_code.as_deref() == Some("already_patched") { - event = PatchEvent::new(PatchAction::Skipped, candidate.clone()) - .with_reason( - "already_vendored", - "artifact and lockfile wiring already in sync", - ); + if entry.is_none() { + event = PatchEvent::new(PatchAction::Skipped, candidate.clone()) + .with_reason( + "already_vendored", + "artifact and lockfile wiring already in sync", + ); + } else { + let files = result + .files_verified + .iter() + .map(|f| crate::json_envelope::PatchEventFile { + path: f.file.clone(), + verified: true, + applied_via: None, + }) + .collect(); + event = PatchEvent::new(PatchAction::Applied, candidate.clone()) + .with_files(files); + } } env.record(event); for w in &warnings { record_warning(env, candidate, w, common); } - if let Some(mut entry) = entry { - entry.detached = detached; - entry.record = detached.then(|| record.clone()); - // A re-vendor run re-derives the entry from current - // disk state, where the takeover already happened — - // preserve the prior flag or the revert-time - // "takeover_not_restored" hint is lost. 
- let prev = state.entries.get(candidate).cloned(); - if let Some(prev) = &prev { - entry.took_over_go_patches = - entry.took_over_go_patches || prev.took_over_go_patches; - // A re-vendor (new patch uuid) rewrites our own - // stale wiring, so the backend records - // `original: None` (it must never record a - // dangling `.socket/vendor/` pointer as the - // pre-vendor fragment). The TRUE pre-vendor - // original lives in the entry being replaced — - // carry it forward by wiring identity, or a - // later `--revert` can only shrug - // (`vendor_lock_entry_drifted`) instead of - // restoring the registry fragment. - for rec in &mut entry.wiring { - if rec.action - == socket_patch_core::patch::vendor::state::WiringAction::Rewritten - && rec.original.is_none() - { - if let Some(prev_rec) = prev.wiring.iter().find(|p| { - p.file == rec.file - && p.kind == rec.kind - && p.key == rec.key - }) { - rec.original = prev_rec.original.clone(); - } - } - } - } - let new_uuid = entry.uuid.clone(); - state.entries.insert(candidate.clone(), entry); - // Persist per-package so a crash mid-run leaves a - // ledger that matches what's already wired. - if let Err(e) = save_state(&common.cwd, &state).await { - has_errors = true; - env.record( - PatchEvent::new(PatchAction::Failed, candidate.clone()) - .with_error("vendor_state_write_failed", e.to_string()), - ); - } else if let Some(prev) = prev.filter(|p| p.uuid != new_uuid) { - // Re-vendor under a newer patch uuid: the old - // uuid's dir is an orphan now — the wiring and - // ledger both point at the new uuid — unless - // another entry still shares it (the same - // `(eco, uuid)` ownership test as `--revert`'s - // orphan sweep). Only the live entry would - // otherwise reclaim it, and that never happens. 
- let still_referenced = state - .entries - .values() - .any(|e| e.ecosystem == prev.ecosystem && e.uuid == prev.uuid); - let stale_rel = - vendor::path::vendor_uuid_dir_rel(&prev.ecosystem, &prev.uuid); - if let Some(rel) = stale_rel.filter(|_| !still_referenced) { - if !common.dry_run { - let _ = remove_tree(&common.cwd.join(rel)).await; - } - env.record( - PatchEvent::new(PatchAction::Removed, candidate.clone()) - .with_reason( - "vendor_stale_artifact_removed", - "previous patch uuid's vendored artifact removed", - ), - ); - } - } + if let Some(entry) = entry { + has_errors |= persist_vendor_entry( + common, env, &mut state, candidate, entry, detached, record, + ) + .await; } } } @@ -681,10 +984,10 @@ pub(crate) async fn vendor_records( } // Manifest entries that targeted in-scope ecosystems but had no - // installed package on disk. + // installed package on disk (and could not be auto-fetched). let mut unmatched: Vec = vendorable .iter() - .filter(|p| !matched.contains(*p)) + .filter(|p| !matched.contains(*p) && !fetch_failed.contains(*p)) .cloned() .collect(); unmatched.sort(); @@ -694,15 +997,38 @@ pub(crate) async fn vendor_records( .map(|p| strip_purl_qualifiers(p).to_string()) .collect(); unmatched.retain(|p| !vendored_bases.contains(strip_purl_qualifiers(p))); + has_errors |= !fetch_failed.is_empty(); if !unmatched.is_empty() { has_errors = true; + // Offline runs name the packages the lockfile COULD have fetched — + // the inventory is a local file read, allowed offline. 
+ let lock_resolvable: HashSet = if common.offline { + let entries = + socket_patch_core::patch::vendor::lock_inventory::inventory_project(&common.cwd) + .await; + unmatched + .iter() + .filter(|p| { + socket_patch_core::patch::vendor::lock_inventory::lookup(&entries, p).is_some() + }) + .cloned() + .collect() + } else { + HashSet::new() + }; for purl in &unmatched { + let detail = if lock_resolvable.contains(purl) { + "no installed package found; --offline prevents fetching it from the \ + registry (the lockfile resolves it)" + } else { + "no installed package found" + }; env.record( PatchEvent::new(PatchAction::Skipped, purl.clone()) - .with_reason("package_not_installed", "no installed package found"), + .with_reason("package_not_installed", detail), ); if !common.silent && !common.json { - eprintln!("Cannot vendor {purl}: package not installed"); + eprintln!("Cannot vendor {}: {detail}", normalize_purl(purl)); } } } @@ -741,12 +1067,6 @@ pub(crate) async fn reconcile_dropped( // Respect this run's --ecosystems scope: a `vendor --ecosystems npm` // invocation must not silently revert a cargo/go entry (restoring its // lockfile and deleting its artifact) as a cross-ecosystem side effect. - let in_scope = |eco: &str| match common.ecosystems.as_deref() { - None => true, - Some(list) => list.iter().any(|e| { - e.eq_ignore_ascii_case(eco) || (eco == "golang" && e.eq_ignore_ascii_case("go")) - }), - }; let stale: Vec = state .entries .iter() @@ -756,7 +1076,7 @@ pub(crate) async fn reconcile_dropped( // normal state, not a drop — only `vendor --revert` or // `remove` may undo them. !entry.detached - && in_scope(&entry.ecosystem) + && ecosystem_in_scope(common, &entry.ecosystem) && !manifest.patches.contains_key(*purl) && !manifest.patches.contains_key(&entry.base_purl) }) @@ -849,19 +1169,7 @@ async fn run_revert(args: &VendorArgs, env: &mut Envelope) -> i32 { // state file, or artifacts left by an interrupted run). 
The lockfile // wiring for these is already gone or owned by a recorded entry, so // removal is safe; unparseable dirs are reported, never deleted. - let swept = vendor::path::sweep_vendor_dirs(&common.cwd).await; - let recorded_units: HashSet<(&str, &str)> = state - .entries - .values() - .map(|e| (e.ecosystem.as_str(), e.uuid.as_str())) - .collect(); - for unit in swept { - if recorded_units.contains(&(unit.eco.as_str(), unit.uuid.as_str())) { - continue; - } - if !common.dry_run { - let _ = remove_tree(&unit.dir).await; - } + for unit in sweep_orphan_vendor_dirs(&common.cwd, &state, common.dry_run).await { let label = unit .purls .first() @@ -899,3 +1207,406 @@ async fn run_revert(args: &VendorArgs, env: &mut Envelope) -> i32 { 0 } } + +// ───────────────────────── prune-time vendored GC ───────────────────────── + +/// Summary of the vendored-state GC pass `scan --prune` runs (wet or +/// preview). Purls are the state-ledger keys (manifest spelling). +#[derive(Debug, Default)] +pub(crate) struct VendorGcSummary { + /// (a) entries whose patch is gone from the manifest — reverted. + pub dropped_reverted: Vec, + /// (b) entries whose package left the lockfile dependency graph — + /// reverted, and their manifest entries dropped. + pub unused_reverted: Vec, + /// (c) orphan uuid dirs (no owning ledger entry) swept. + pub orphan_dirs: usize, + /// Entries that could not be reverted (kept in the ledger), plus any + /// pass-level skip marker (e.g. lock contention). 
+ pub failed: Vec, +} + +/// The vendored-state GC behind `scan --prune`: +/// +/// (a) revert entries whose patch was dropped from the manifest (same +/// stale test as [`reconcile_dropped`], shared with the vendor flows); +/// (b) revert entries whose dependency is no longer in the lockfile graph +/// ([`dispatch_in_use_one`] == `Some(false)`; `None` keeps, fail-safe) +/// and drop their manifest entries so the caller's manifest prune + +/// blob sweep reclaims the rest in the same pass; +/// (c) sweep orphan uuid dirs. +/// +/// Detached entries are exempt from BOTH (a) (never manifest-tracked) and +/// (b) (lockfile-invisible by design — the probe would always call them +/// unused). A missing/unreadable manifest skips (a) only (a prune must +/// not mass-revert on a deleted manifest — that is `vendor --revert`'s +/// explicit contract). +/// +/// Wet runs take the apply lock (lockfiles + the manifest are rewritten); +/// contention records a skip marker and returns — it never fails the +/// scan. Dry runs are read-only, lock-free, and list-only. +pub(crate) async fn run_vendor_gc( + common: &GlobalArgs, + manifest_path: &Path, + dry_run: bool, +) -> VendorGcSummary { + let mut out = VendorGcSummary::default(); + let mut state = match load_state(&common.cwd).await { + Ok(s) if !s.entries.is_empty() => s, + // No ledger (or unreadable): only the orphan sweep could apply, and + // without a trustworthy ledger it must not delete anything. + _ => return out, + }; + + let socket_dir = manifest_path + .parent() + .map(Path::to_path_buf) + .unwrap_or_else(|| common.cwd.clone()); + let _guard = if dry_run { + None + } else { + match socket_patch_core::patch::apply_lock::acquire(&socket_dir, Duration::from_secs(0)) { + Ok(g) => Some(g), + Err(_) => { + out.failed.push( + "vendor GC skipped: another socket-patch run holds the apply lock".to_string(), + ); + return out; + } + } + }; + + // (a) manifest-dropped entries. 
+ let mut manifest = socket_patch_core::manifest::operations::read_manifest(manifest_path) + .await + .ok() + .flatten(); + if let Some(m) = &manifest { + let stale: Vec = state + .entries + .iter() + .filter(|(purl, entry)| { + !entry.detached + && ecosystem_in_scope(common, &entry.ecosystem) + && !m.patches.contains_key(*purl) + && !m.patches.contains_key(&entry.base_purl) + }) + .map(|(purl, _)| purl.clone()) + .collect(); + for purl in stale { + if dry_run { + out.dropped_reverted.push(purl); + continue; + } + let entry = state.entries.get(&purl).cloned().expect("listed above"); + if dispatch_revert_one(&entry, &common.cwd, false) + .await + .success + { + state.entries.remove(&purl); + out.dropped_reverted.push(purl); + } else { + out.failed.push(purl); + } + } + } + + // (b) lockfile-unused entries. + let mut manifest_dirty = false; + let candidates: Vec = state + .entries + .iter() + .filter(|(_, entry)| !entry.detached && ecosystem_in_scope(common, &entry.ecosystem)) + .map(|(purl, _)| purl.clone()) + .collect(); + for purl in candidates { + let entry = state.entries.get(&purl).cloned().expect("listed above"); + if dispatch_in_use_one(&entry, &common.cwd).await != Some(false) { + continue; // in use, or cannot determine — keep + } + if dry_run { + out.unused_reverted.push(purl); + continue; + } + if !dispatch_revert_one(&entry, &common.cwd, false) + .await + .success + { + out.failed.push(purl); + continue; + } + state.entries.remove(&purl); + if let Some(m) = manifest.as_mut() { + let base = strip_purl_qualifiers(&entry.base_purl).to_string(); + let dropped: Vec = m + .patches + .keys() + .filter(|k| *k == &purl || strip_purl_qualifiers(k) == base) + .cloned() + .collect(); + for k in dropped { + m.patches.remove(&k); + manifest_dirty = true; + } + } + out.unused_reverted.push(purl); + } + + if !dry_run { + let _ = save_state(&common.cwd, &state).await; + if manifest_dirty { + if let Some(m) = &manifest { + let _ = + 
socket_patch_core::manifest::operations::write_manifest(manifest_path, m).await; + } + } + } + + // (c) orphan uuid dirs, against the post-removal ledger. + out.orphan_dirs = sweep_orphan_vendor_dirs(&common.cwd, &state, dry_run) + .await + .len(); + out +} + +#[cfg(test)] +mod gc_tests { + use super::*; + use socket_patch_core::manifest::operations::{read_manifest, write_manifest}; + use socket_patch_core::patch::vendor::state::VendorArtifact; + use socket_patch_core::patch::vendor::VendorState; + use std::path::PathBuf; + + const UUID: &str = "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"; + const PURL: &str = "pkg:npm/left-pad@1.3.0"; + + fn entry(detached: bool) -> VendorEntry { + VendorEntry { + ecosystem: "npm".into(), + base_purl: PURL.into(), + uuid: UUID.into(), + artifact: VendorArtifact { + path: format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring: Vec::new(), + lock: None, + took_over_go_patches: false, + detached, + record: None, + flavor: Some("package-lock".into()), + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } + } + + /// Tempdir with: a manifest carrying PURL, a ledger with one entry, + /// the artifact on disk, and a package-lock that resolves to it. 
+ async fn gc_fixture(detached: bool) -> (tempfile::TempDir, GlobalArgs, PathBuf) { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let socket = root.join(".socket"); + tokio::fs::create_dir_all(socket.join(format!("vendor/npm/{UUID}"))) + .await + .unwrap(); + tokio::fs::write( + socket.join(format!("vendor/npm/{UUID}/left-pad-1.3.0.tgz")), + b"tgz", + ) + .await + .unwrap(); + + let mut manifest = PatchManifest::new(); + manifest.patches.insert( + PURL.to_string(), + socket_patch_core::manifest::schema::PatchRecord { + uuid: UUID.to_string(), + exported_at: String::new(), + files: HashMap::new(), + vulnerabilities: HashMap::new(), + description: String::new(), + license: String::new(), + tier: String::new(), + }, + ); + let manifest_path = socket.join("manifest.json"); + write_manifest(&manifest_path, &manifest).await.unwrap(); + + let mut state = VendorState::default(); + state.entries.insert(PURL.to_string(), entry(detached)); + save_state(root, &state).await.unwrap(); + + tokio::fs::write( + root.join("package-lock.json"), + format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await + .unwrap(); + + let common = GlobalArgs { + cwd: root.to_path_buf(), + json: true, + silent: true, + ..GlobalArgs::default() + }; + (tmp, common, manifest_path) + } + + /// In-manifest + in-lock: the GC keeps everything. + #[tokio::test] + async fn vendor_gc_keeps_in_use_entries() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.dropped_reverted.is_empty(), "{out:?}"); + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert_eq!(out.orphan_dirs, 0); + assert!(load_state(tmp.path()) + .await + .unwrap() + .entries + .contains_key(PURL)); + } + + /// (a) the patch is gone from the manifest: revert + drop the entry. 
+ #[tokio::test] + async fn vendor_gc_reverts_manifest_dropped_entry() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + write_manifest(&manifest_path, &PatchManifest::new()) + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.dropped_reverted, vec![PURL.to_string()], "{out:?}"); + assert!(out.failed.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.is_empty()); + assert!( + !tmp.path() + .join(format!(".socket/vendor/npm/{UUID}")) + .exists(), + "artifact dir removed by the revert" + ); + } + + /// (b) the dependency left the lockfile graph: revert + drop BOTH the + /// ledger entry and the manifest entry. + #[tokio::test] + async fn vendor_gc_reverts_unused_entry_and_drops_manifest_entry() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + // Re-lock without the dependency (no reference to the artifact). + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.unused_reverted, vec![PURL.to_string()], "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.is_empty()); + let manifest = read_manifest(&manifest_path).await.unwrap().unwrap(); + assert!( + !manifest.patches.contains_key(PURL), + "the unused entry's manifest record is dropped too" + ); + } + + /// Dry run lists without mutating anything. 
+ #[tokio::test] + async fn vendor_gc_dry_run_is_read_only() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + let state_before = tokio::fs::read(tmp.path().join(".socket/vendor/state.json")) + .await + .unwrap(); + let manifest_before = tokio::fs::read(&manifest_path).await.unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, true).await; + assert_eq!(out.unused_reverted, vec![PURL.to_string()], "{out:?}"); + assert_eq!( + tokio::fs::read(tmp.path().join(".socket/vendor/state.json")) + .await + .unwrap(), + state_before, + "dry run must not touch the ledger" + ); + assert_eq!( + tokio::fs::read(&manifest_path).await.unwrap(), + manifest_before, + "dry run must not touch the manifest" + ); + assert!( + tmp.path() + .join(format!(".socket/vendor/npm/{UUID}")) + .exists(), + "dry run must not remove artifacts" + ); + } + + /// A missing/undeterminable lockfile keeps the entry (fail-safe), and a + /// DETACHED entry is exempt from both (a) and (b). + #[tokio::test] + async fn vendor_gc_keeps_undeterminable_and_detached_entries() { + // Lock removed entirely: probe says None → keep. + let (tmp, common, manifest_path) = gc_fixture(false).await; + tokio::fs::remove_file(tmp.path().join("package-lock.json")) + .await + .unwrap(); + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()) + .await + .unwrap() + .entries + .contains_key(PURL)); + + // Detached entry: absent from the manifest AND lockfile-invisible — + // exactly its normal state. Never reverted by the GC. 
+ let (tmp, common, manifest_path) = gc_fixture(true).await; + write_manifest(&manifest_path, &PatchManifest::new()) + .await + .unwrap(); + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.dropped_reverted.is_empty(), "{out:?}"); + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()) + .await + .unwrap() + .entries + .contains_key(PURL)); + } + + /// (c) uuid dirs with no owning ledger entry are swept (wet) / counted + /// (dry). + #[tokio::test] + async fn vendor_gc_sweeps_orphan_uuid_dirs() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + let orphan_uuid = "1a2b3c4d-5e6f-4a1b-8c2d-9e0f1a2b3c4d"; + let orphan_dir = tmp.path().join(format!(".socket/vendor/npm/{orphan_uuid}")); + tokio::fs::create_dir_all(&orphan_dir).await.unwrap(); + tokio::fs::write(orphan_dir.join("left-pad-1.3.0.tgz"), b"tgz") + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, true).await; + assert_eq!(out.orphan_dirs, 1, "{out:?}"); + assert!(orphan_dir.exists(), "dry run keeps the orphan"); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.orphan_dirs, 1, "{out:?}"); + assert!(!orphan_dir.exists(), "wet run sweeps the orphan"); + // The recorded entry's dir survives the sweep. + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}")) + .exists()); + } +} diff --git a/crates/socket-patch-cli/src/json_envelope.rs b/crates/socket-patch-cli/src/json_envelope.rs index 9d88fde..13eb342 100644 --- a/crates/socket-patch-cli/src/json_envelope.rs +++ b/crates/socket-patch-cli/src/json_envelope.rs @@ -328,6 +328,10 @@ pub enum PatchAction { /// `apply --dry-run` / `scan --dry-run`: patch *would* apply /// cleanly. `files` lists what would change. 
Verified, + /// `repair`: a missing/corrupt vendored artifact was rebuilt in place + /// from verified sources (lockfiles and the vendor ledger untouched + /// unless drift was healed). + Rebuilt, } /// Patch-source strategy used to apply a file. Mirrors the existing @@ -402,6 +406,14 @@ pub struct Summary { pub failed: u32, pub removed: u32, pub verified: u32, + /// `repair`-only (vendored artifact rebuilds); omitted while zero so + /// every other command's summary shape is unchanged. + #[serde(skip_serializing_if = "u32_is_zero")] + pub rebuilt: u32, +} + +fn u32_is_zero(n: &u32) -> bool { + *n == 0 } impl Summary { @@ -415,6 +427,7 @@ impl Summary { PatchAction::Failed => self.failed += 1, PatchAction::Removed => self.removed += 1, PatchAction::Verified => self.verified += 1, + PatchAction::Rebuilt => self.rebuilt += 1, } } } diff --git a/crates/socket-patch-cli/tests/apply_network.rs b/crates/socket-patch-cli/tests/apply_network.rs index d9bb628..5c4eadb 100644 --- a/crates/socket-patch-cli/tests/apply_network.rs +++ b/crates/socket-patch-cli/tests/apply_network.rs @@ -449,49 +449,101 @@ async fn apply_with_force_overrides_hash_mismatch() { } #[tokio::test] -async fn apply_without_force_hash_mismatch_emits_failed_event() { +async fn apply_hash_mismatch_default_warns_and_applies_strict_fails() { let after = b"after\n"; let after_hash = git_sha256(after); let expected_before = b"expected-before\n"; let actual_before = b"DIFFERENT-CONTENT\n"; let expected_before_hash = git_sha256(expected_before); - let tmp = tempfile::tempdir().expect("tempdir"); - write_root_package_json(tmp.path()); - write_npm_package(tmp.path(), "mismatch", "1.0.0", "index.js", actual_before); - let socket = tmp.path().join(".socket"); - write_manifest_with_patch( - &socket, - "pkg:npm/mismatch@1.0.0", - "11111111-1111-4111-8111-111111111111", - &expected_before_hash, - &after_hash, - ); - let blobs = socket.join("blobs"); - std::fs::create_dir_all(&blobs).unwrap(); - 
std::fs::write(blobs.join(&after_hash), after).unwrap(); + let fixture = || { + let tmp = tempfile::tempdir().expect("tempdir"); + write_root_package_json(tmp.path()); + write_npm_package(tmp.path(), "mismatch", "1.0.0", "index.js", actual_before); + let socket = tmp.path().join(".socket"); + write_manifest_with_patch( + &socket, + "pkg:npm/mismatch@1.0.0", + "11111111-1111-4111-8111-111111111111", + &expected_before_hash, + &after_hash, + ); + let blobs = socket.join("blobs"); + std::fs::create_dir_all(&blobs).unwrap(); + std::fs::write(blobs.join(&after_hash), after).unwrap(); + tmp + }; + // DEFAULT: the mismatch is overwritten with the full verified patched + // content (the diff strategy would self-skip; the blob is hash-gated to + // afterHash) and surfaced as a warning event — exit 0. + let tmp = fixture(); let out = Command::new(binary()) .args(["apply", "--json", "--offline"]) .current_dir(tmp.path()) .env_remove("SOCKET_API_TOKEN") .output() .expect("run socket-patch"); - let code = out.status.code().unwrap_or(-1); let stdout = String::from_utf8_lossy(&out.stdout).to_string(); - assert_eq!(code, 1, "hash mismatch w/o --force must exit 1"); let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); - assert_eq!(v["status"], "partialFailure"); + assert_eq!( + out.status.code().unwrap_or(-1), + 0, + "default mismatch is a warning, not an error: {v:#}" + ); + assert_eq!(v["status"], "success", "{v:#}"); let events = v["events"].as_array().expect("events array"); - let has_failed = events.iter().any(|e| e["action"] == "failed"); assert!( - has_failed, - "must emit a failed event on hash mismatch; got events={events:?}" + events.iter().any(|e| e["action"] == "applied"), + "{events:?}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "content_mismatch_overwritten"), + "the overwrite is surfaced as a warning event: {events:?}" + ); + let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); + 
assert_eq!( + content, after, + "the file carries the verified patched bytes" + ); + + // The human run logs the warning to stderr. + let tmp = fixture(); + let out = Command::new(binary()) + .args(["apply", "--offline", "--yes"]) + .current_dir(tmp.path()) + .env_remove("SOCKET_API_TOKEN") + .output() + .expect("run socket-patch"); + let stderr = String::from_utf8_lossy(&out.stderr); + assert_eq!(out.status.code().unwrap_or(-1), 0, "stderr={stderr}"); + assert!( + stderr.contains("content_mismatch_overwritten"), + "stderr warning present: {stderr}" ); - // File must be UNCHANGED. + // --strict: the old fail-closed contract — exit 1, failed event, file + // untouched. + let tmp = fixture(); + let out = Command::new(binary()) + .args(["apply", "--json", "--offline", "--strict"]) + .current_dir(tmp.path()) + .env_remove("SOCKET_API_TOKEN") + .output() + .expect("run socket-patch"); + let stdout = String::from_utf8_lossy(&out.stdout).to_string(); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(out.status.code().unwrap_or(-1), 1, "{v:#}"); + assert_eq!(v["status"], "partialFailure", "{v:#}"); + let events = v["events"].as_array().expect("events array"); + assert!( + events.iter().any(|e| e["action"] == "failed"), + "strict emits a failed event: {events:?}" + ); let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); - assert_eq!(content, actual_before, "hash mismatch must not modify file"); + assert_eq!(content, actual_before, "strict must not modify the file"); } // --------------------------------------------------------------------------- @@ -650,3 +702,95 @@ async fn apply_uses_locally_cached_blob_without_fetching() { "cached blob must survive apply" ); } + +// --------------------------------------------------------------------------- +// Mismatch + diff-mode sources: the full blob is redownloaded on demand. 
+// --------------------------------------------------------------------------- + +/// A mismatched file cannot be patched from a partial source (the diff +/// strategy needs the exact before-bytes), so the default mismatch policy +/// redownloads the FULL afterHash blob and applies that — even when a +/// local source archive made the stage step skip downloading. +#[tokio::test] +async fn apply_mismatch_redownloads_full_blob_and_applies() { + let after = b"after\n"; + let after_hash = git_sha256(after); + let expected_before_hash = git_sha256(b"expected-before\n"); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(path(format!( + "/v0/orgs/{ORG_SLUG}/patches/blob/{after_hash}" + ))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(after.to_vec())) + .mount(&mock) + .await; + + let uuid = "11111111-1111-4111-8111-111111111111"; + let tmp = tempfile::tempdir().expect("tempdir"); + write_root_package_json(tmp.path()); + write_npm_package( + tmp.path(), + "mismatch", + "1.0.0", + "index.js", + b"DIFFERENT-CONTENT\n", + ); + let socket = tmp.path().join(".socket"); + write_manifest_with_patch( + &socket, + "pkg:npm/mismatch@1.0.0", + uuid, + &expected_before_hash, + &after_hash, + ); + // A LOCAL package archive exists (so the stage step downloads nothing) + // but carries no entry for index.js — only the blob can produce the + // patched bytes, and no blob is staged. 
+ let packages = socket.join("packages"); + std::fs::create_dir_all(&packages).unwrap(); + { + use std::io::Write as _; + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + std::fs::File::create(packages.join(format!("{uuid}.tar.gz"))).unwrap(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + let bytes = b"unrelated"; + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder + .append_data(&mut header, "other.js", &bytes[..]) + .unwrap(); + builder + .into_inner() + .unwrap() + .finish() + .unwrap() + .flush() + .unwrap(); + } + + let (code, stdout, stderr) = run_apply(tmp.path(), &mock.uri(), &[]); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(code, 0, "stdout={v:#}\nstderr={stderr}"); + let events = v["events"].as_array().expect("events array"); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "content_mismatch_overwritten"), + "{events:?}" + ); + + // The blob was fetched on demand… + let requests = mock.received_requests().await.unwrap(); + let blob_path = format!("/v0/orgs/{ORG_SLUG}/patches/blob/{after_hash}"); + assert!( + requests.iter().any(|r| r.url.path() == blob_path), + "the full blob must be redownloaded for the mismatched file" + ); + // …and the file carries the verified patched bytes. + let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); + assert_eq!(content, after); +} diff --git a/crates/socket-patch-cli/tests/cli_global_args.rs b/crates/socket-patch-cli/tests/cli_global_args.rs index bbcb84b..6faebd8 100644 --- a/crates/socket-patch-cli/tests/cli_global_args.rs +++ b/crates/socket-patch-cli/tests/cli_global_args.rs @@ -202,6 +202,7 @@ fn global_flag_cases_cover_every_global_field() { break_lock: _, debug: _, no_telemetry: _, + strict: _, } = common; // 20 fields ↔ 20 long-flag cases. 
Bump both this count and add a case when diff --git a/crates/socket-patch-cli/tests/cli_parse_scan.rs b/crates/socket-patch-cli/tests/cli_parse_scan.rs index 359994f..b961eb4 100644 --- a/crates/socket-patch-cli/tests/cli_parse_scan.rs +++ b/crates/socket-patch-cli/tests/cli_parse_scan.rs @@ -523,6 +523,7 @@ fn scan_json_empty_cwd_emits_updates_key() { let expected = serde_json::json!({ "status": "success", "scannedPackages": 0, + "lockfileOnlyPackages": 0, "packagesWithPatches": 0, "totalPatches": 0, "freePatches": 0, diff --git a/crates/socket-patch-cli/tests/docker_e2e_cargo.rs b/crates/socket-patch-cli/tests/docker_e2e_cargo.rs index 9966217..95cd40a 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_cargo.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_cargo.rs @@ -177,12 +177,13 @@ chmod u+w "$LIB_RS" || true # scan --sync writes manifest + blob; the cargo crawler with --global # probes $CARGO_HOME/registry/src/. Note: in this fixture scan's own -# apply pass matches 0 files (the all-zeros beforeHash doesn't match the -# real cfg-if bytes), so scan exits non-zero (partial_failure) BY DESIGN -# — the dedicated `apply --force` step below does the real patching. -# Exit code is logged for diagnostics, not gated; the gate is the exact -# content-hash check at the end. -socket-patch scan --json --sync --yes --global \ +# apply pass meets an all-zeros beforeHash that doesn't match the real +# cfg-if bytes; `--strict` pins the hard-error behavior (the default +# would warn and apply the full blob) so scan exits non-zero +# (partial_failure) BY DESIGN and the dedicated `apply --force` step +# below stays the verified writer. Exit code is logged for diagnostics, +# not gated; the gate is the exact content-hash check at the end. +socket-patch scan --json --sync --strict --yes --global \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems cargo > /tmp/sync.out 2>/tmp/sync.err SCAN_RC=$? 
@@ -227,8 +228,16 @@ grep -q '"failed": 0,' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -q '"skipped": 0,' /tmp/apply.out || {{ - echo "FAIL: apply JSON did not report skipped:0" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. +grep -q '"skipped": 1,' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} diff --git a/crates/socket-patch-cli/tests/docker_e2e_composer.rs b/crates/socket-patch-cli/tests/docker_e2e_composer.rs index 6686e82..b71ec49 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_composer.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_composer.rs @@ -225,7 +225,7 @@ fi PRE_SHA=$(sha256sum "$PHP_FILE" | cut -d' ' -f1) # scan exit code is intentionally not gated (see verify_snippet); capture JSON. -socket-patch scan --json --sync --yes \ +socket-patch scan --json --sync --strict --yes \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems composer > /tmp/scan.json 2>/tmp/sync.err cat /tmp/sync.err >&2 @@ -264,7 +264,7 @@ PRE_SHA=$(sha256sum "$PHP_FILE" | cut -d' ' -f1) mkdir -p /workspace/proj && cd /workspace/proj # scan exit code is intentionally not gated (see verify_snippet); capture JSON. 
-socket-patch scan --json --sync --yes --global \ +socket-patch scan --json --sync --strict --yes --global \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems composer > /tmp/scan.json 2>/tmp/sync.err cat /tmp/sync.err >&2 diff --git a/crates/socket-patch-cli/tests/docker_e2e_golang.rs b/crates/socket-patch-cli/tests/docker_e2e_golang.rs index e281432..e2d5d16 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_golang.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_golang.rs @@ -166,7 +166,7 @@ chmod u+w "$GIN_GO" || true # exits non-zero (partial_failure) BY DESIGN — the dedicated `apply # --force` step below does the real patching. Exit code is logged for # diagnostics, not gated; the gate is the exact content-hash check below. -socket-patch scan --json --sync --yes --global \ +socket-patch scan --json --sync --strict --yes --global \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems golang > /tmp/sync.out 2>/tmp/sync.err SCAN_RC=$? @@ -209,8 +209,16 @@ grep -qE '^[[:space:]]*"failed": 0,[[:space:]]*$' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -qE '^[[:space:]]*"skipped": 0,[[:space:]]*$' /tmp/apply.out || {{ - echo "FAIL: apply JSON reported a non-zero skipped count" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. 
+grep -qE '^[[:space:]]*"skipped": 1,[[:space:]]*$' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} diff --git a/crates/socket-patch-cli/tests/docker_e2e_maven.rs b/crates/socket-patch-cli/tests/docker_e2e_maven.rs index 526bbf4..5dc3474 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_maven.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_maven.rs @@ -190,7 +190,7 @@ chmod u+w "$POM_FILE" || true # gated (scan's own apply pass matches 0 files because the all-zeros # beforeHash doesn't match the real .pom bytes); the gate is the exact # content-hash check at the end. -socket-patch scan --json --sync --yes --global \ +socket-patch scan --json --sync --strict --yes --global \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems maven > /tmp/sync.out 2>/tmp/sync.err SCAN_RC=$? @@ -235,8 +235,16 @@ grep -q '"failed": 0,' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -q '"skipped": 0,' /tmp/apply.out || {{ - echo "FAIL: apply JSON did not report skipped:0" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. 
+grep -q '"skipped": 1,' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} diff --git a/crates/socket-patch-cli/tests/docker_e2e_nuget.rs b/crates/socket-patch-cli/tests/docker_e2e_nuget.rs index 7cad49e..622b9a2 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_nuget.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_nuget.rs @@ -198,7 +198,7 @@ echo "===SCAN VERIFIED===" >&2 # because the fixture's placeholder beforeHash doesn't match the real # installed bytes. That's expected — the separate forced apply below # is what actually writes the patch, so we only log sync's exit code. -socket-patch scan --json --sync --yes "${{COMMON_ARGS[@]}}" >/tmp/sync.out 2>/tmp/sync.err +socket-patch scan --json --sync --strict --yes "${{COMMON_ARGS[@]}}" >/tmp/sync.out 2>/tmp/sync.err echo "sync exit=$?" >&2 cat /tmp/sync.out >&2 || true cat /tmp/sync.err >&2 || true @@ -240,8 +240,16 @@ grep -q '"failed": 0,' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -q '"skipped": 0,' /tmp/apply.out || {{ - echo "FAIL: apply JSON did not report skipped:0" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. +grep -q '"skipped": 1,' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} @@ -326,7 +334,7 @@ echo "===SCAN VERIFIED===" >&2 # 2. scan --sync. 
May exit non-zero (un-forced sync-apply HashMismatch # against the fixture's placeholder beforeHash); the forced apply # below is what writes the patch, so only log sync's exit code. -socket-patch scan --json --sync --yes "${{COMMON_ARGS[@]}}" >/tmp/sync.out 2>/tmp/sync.err +socket-patch scan --json --sync --strict --yes "${{COMMON_ARGS[@]}}" >/tmp/sync.out 2>/tmp/sync.err echo "sync exit=$?" >&2 cat /tmp/sync.out >&2 || true cat /tmp/sync.err >&2 || true @@ -362,8 +370,16 @@ grep -q '"failed": 0,' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -q '"skipped": 0,' /tmp/apply.out || {{ - echo "FAIL: apply JSON did not report skipped:0" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. +grep -q '"skipped": 1,' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} diff --git a/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs b/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs index 12e1b1a..28c118d 100644 --- a/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs +++ b/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs @@ -81,6 +81,10 @@ fn cargo(cwd: &Path, args: &[&str], cargo_home: &Path) -> Output { .args(args) .current_dir(cwd) .env("CARGO_HOME", cargo_home) + // The assertions read `/target/debug/...`; an ambient + // CARGO_TARGET_DIR (shared-build-cache setups) would redirect the + // child build elsewhere and break them. 
+ .env_remove("CARGO_TARGET_DIR") .output() .expect("failed to run cargo") } diff --git a/crates/socket-patch-cli/tests/in_process_cargo_apply.rs b/crates/socket-patch-cli/tests/in_process_cargo_apply.rs index 860c4dd..bf17233 100644 --- a/crates/socket-patch-cli/tests/in_process_cargo_apply.rs +++ b/crates/socket-patch-cli/tests/in_process_cargo_apply.rs @@ -293,12 +293,14 @@ async fn cargo_fetch_scan_sync_patches_real_file() { } /// Safety gate: when the patch's advertised `beforeHash` does NOT match the -/// on-disk file, apply must REFUSE to write (it cannot trust that the blob is -/// a valid successor of whatever is actually on disk). The positive test -/// above only ever feeds a correct `beforeHash`, so a regression that made -/// apply blindly clobber the file regardless of its current content would -/// sail through it. This test pins the refusal: the file must be left -/// byte-for-byte untouched and the run must NOT report success. +/// on-disk file, `--strict` apply must REFUSE to write (the v3.4 DEFAULT +/// instead overwrites with the verified afterHash content and warns — see +/// `apply_hash_mismatch_default_warns_and_applies_strict_fails`). The +/// positive test above only ever feeds a correct `beforeHash`, so a +/// regression that made strict mode clobber the file regardless of its +/// current content would sail through it. This test pins the strict +/// refusal: the file must be left byte-for-byte untouched and the run must +/// NOT report success. #[tokio::test] #[serial] async fn cargo_apply_refuses_on_before_hash_mismatch() { @@ -344,9 +346,10 @@ async fn cargo_apply_refuses_on_before_hash_mismatch() { ecosystems: Some(vec!["cargo".to_string()]), download_mode: "diff".to_string(), dry_run: false, - // force MUST stay false: with --force, a hash mismatch is - // deliberately downgraded to "ready" and the file WOULD be - // overwritten. We are asserting the safe default refuses. 
+ // strict pins the fail-closed contract: the v3.4 default (and + // --force) deliberately downgrade a hash mismatch to "ready" + // and the file WOULD be overwritten with verified content. + strict: true, ..socket_patch_cli::args::GlobalArgs::default() }, batch_size: 100, diff --git a/crates/socket-patch-cli/tests/in_process_get_update_count.rs b/crates/socket-patch-cli/tests/in_process_get_update_count.rs index a2101ae..905a5d0 100644 --- a/crates/socket-patch-cli/tests/in_process_get_update_count.rs +++ b/crates/socket-patch-cli/tests/in_process_get_update_count.rs @@ -72,6 +72,8 @@ fn params(root: &Path, server: &MockServer) -> DownloadParams { org_slug: Some(ORG.to_string()), proxy_url: None, }, + strict: false, + persist_blobs: true, // Skip release-narrowing; npm has no variants anyway. all_releases: true, } diff --git a/crates/socket-patch-cli/tests/in_process_vendor.rs b/crates/socket-patch-cli/tests/in_process_vendor.rs index 0ef740c..536ea67 100644 --- a/crates/socket-patch-cli/tests/in_process_vendor.rs +++ b/crates/socket-patch-cli/tests/in_process_vendor.rs @@ -1228,3 +1228,183 @@ fn json_envelope_shape() { assert_eq!(env["status"], "noManifest"); assert!(events(&env).is_empty()); } + +// ──────────────── vendor auto-force + already-applied lifecycle ──────────────── + +/// A package already patched IN PLACE by `apply` must vendor cleanly on the +/// first run — and the envelope must report it as `applied` (this run packed +/// the artifact and rewired the lock), NOT `skipped/already_vendored`. The +/// second run is the true in-sync rerun and reports `already_vendored`. +#[test] +fn vendor_after_in_place_apply_emits_applied_event() { + let fx = npm_fixture(); + // Simulate a prior in-place `socket-patch apply`. 
+ std::fs::write(fx.installed_index(), PATCHED_INDEX).unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], PURL); + assert_eq!( + env["summary"]["applied"], 1, + "first vendor of an applied package counts as applied: {env:#}" + ); + assert!(fx.tgz_path().exists(), "artifact packed"); + assert!(fx.state_path().exists(), "ledger entry recorded"); + // No mismatch warning: afterHash content is AlreadyPatched, not divergent. + assert!( + !events(&env) + .iter() + .any(|e| e["errorCode"] == "vendor_content_mismatch_overwritten"), + "{env:#}" + ); + + // Second run: artifact + wiring already in sync. + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + find_event(&env, "skipped", Some("already_vendored")); + assert_eq!(env["summary"]["applied"], 0); +} + +/// Installed content matching NEITHER hash (a patch built against different +/// bytes than the installed artifact — the flatted@3.3.1 case) still vendors: +/// the stage is overwritten with the verified patched content, the run exits +/// 0 with an `applied` event, and the overwrite surfaces as a +/// `vendor_content_mismatch_overwritten` warning event. 
+#[test] +fn mismatched_baseline_vendors_with_warning_event() { + let fx = npm_fixture(); + std::fs::write( + fx.installed_index(), + b"module.exports = () => 'divergent';\n", + ) + .unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], PURL); + let warning = find_event(&env, "skipped", Some("vendor_content_mismatch_overwritten")); + assert!( + warning["reason"] + .as_str() + .unwrap_or("") + .contains("left-pad@1.3.0"), + "warning names the package: {env:#}" + ); + assert!( + fx.tgz_path().exists(), + "artifact packed despite the mismatch" + ); + // The installed tree keeps its divergent bytes (only the stage changed). + assert_eq!( + std::fs::read(fx.installed_index()).unwrap(), + b"module.exports = () => 'divergent';\n" + ); +} + +/// A patch-target file MISSING from the installed package still fails closed +/// (auto-force must not inherit `--force`'s silent NotFound skip — the +/// tarball would ship without the fix); `--force` keeps that tolerance. +#[test] +fn vendor_missing_file_fails_closed_without_force() { + let fx = npm_fixture(); + std::fs::remove_file(fx.installed_index()).unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_ne!(code, 0, "missing patch target must fail: {env:#}"); + let failed = find_event(&env, "failed", None); + assert!( + failed["error"] + .as_str() + .unwrap_or("") + .contains("File not found"), + "{env:#}" + ); + assert_eq!(fx.lock_bytes(), fx.original_lock, "lock byte-untouched"); + assert!(!fx.vendor_dir().exists(), "no artifacts on failure"); + + // --force: the missing file is tolerated (skipped) and the vendor lands. 
+ let fx2 = npm_fixture(); + std::fs::remove_file(fx2.installed_index()).unwrap(); + let (code, env) = vendor_cli(fx2.root(), &["--force"]); + assert_eq!(code, 0, "{env:#}"); +} + +// ──────────────── percent-encoded scoped purls (Fix A integration) ──────────────── + +/// Build a fixture whose installed package is the SCOPED `@scope/left-pad` +/// while the manifest keys the patch by the API's percent-encoded purl +/// (`pkg:npm/%40scope/left-pad@1.3.0`) — exactly what `scan` writes. +fn npm_scoped_fixture() -> NpmFixture { + let fx = npm_fixture_with_purls(&["pkg:npm/%40scope/left-pad@1.3.0"]); + let root = fx.root(); + + // Re-home the installed package under the scope dir. + let scoped = root.join("node_modules/@scope/left-pad"); + std::fs::create_dir_all(scoped.parent().unwrap()).unwrap(); + std::fs::rename(root.join("node_modules/left-pad"), &scoped).unwrap(); + std::fs::write( + scoped.join("package.json"), + br#"{"name":"@scope/left-pad","version":"1.3.0"}"#, + ) + .unwrap(); + + // Re-key the lock entry to the scoped install path. + let mut lock: Value = serde_json::from_slice(&fx.original_lock).unwrap(); + let packages = lock["packages"].as_object_mut().unwrap(); + let entry = packages.remove("node_modules/left-pad").unwrap(); + packages.insert("node_modules/@scope/left-pad".to_string(), entry); + lock["packages"][""]["dependencies"] = json!({ "@scope/left-pad": "^1.3.0" }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), &lock_bytes).unwrap(); + + fx +} + +/// The API serves scoped purls percent-encoded and `scan` stores them +/// verbatim as manifest keys; vendor must decode them to find the installed +/// `node_modules/@scope/...` package and wire the lock — while the ledger +/// stays keyed by the verbatim encoded purl (manifest parity). 
+#[test] +fn vendor_resolves_percent_encoded_scope_purl() { + let fx = npm_scoped_fixture(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], "pkg:npm/%40scope/left-pad@1.3.0"); + + // Artifact lands under the DECODED scope dir. + let tgz = fx.root().join(format!( + ".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )); + assert!(tgz.exists(), "tarball at the decoded scoped path"); + + // Lock rewired to the vendored artifact. + let lock = fx.lock_value(); + assert_eq!( + lock["packages"]["node_modules/@scope/left-pad"]["resolved"], + json!(format!( + "file:.socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )) + ); + + // Ledger keyed by the VERBATIM encoded purl (manifest key parity). + let state: Value = serde_json::from_slice(&std::fs::read(fx.state_path()).unwrap()).unwrap(); + assert!( + state["entries"]["pkg:npm/%40scope/left-pad@1.3.0"].is_object(), + "state keyed by the encoded manifest purl: {state:#}" + ); + + // Round-trip: revert restores the original (scoped) lock bytes. + let (code, env) = vendor_cli(fx.root(), &["--revert"]); + assert_eq!(code, 0, "{env:#}"); + let lock = fx.lock_value(); + assert_eq!( + lock["packages"]["node_modules/@scope/left-pad"]["resolved"], + json!(REG_RESOLVED) + ); + assert!(!fx.vendor_dir().join("npm").exists(), "artifacts removed"); +} diff --git a/crates/socket-patch-cli/tests/repair_vendor_e2e.rs b/crates/socket-patch-cli/tests/repair_vendor_e2e.rs new file mode 100644 index 0000000..a6003fe --- /dev/null +++ b/crates/socket-patch-cli/tests/repair_vendor_e2e.rs @@ -0,0 +1,755 @@ +//! End-to-end tests for `repair`'s vendored-artifact phase: artifacts +//! referenced by the ledger and/or rewired lockfiles but missing/corrupt on +//! disk are rebuilt fail-closed (and the ledger itself is reconstructed from +//! lockfile references when it was deleted wholesale). Mock API + real npm +//! 
lockfile fixtures, driven through the built binary. + +use std::path::{Path, PathBuf}; +use std::process::Command; + +use sha2::{Digest, Sha256}; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +fn binary() -> PathBuf { + env!("CARGO_BIN_EXE_socket-patch").into() +} + +const ORG_SLUG: &str = "test-org"; +const UUID: &str = "11111111-1111-4111-8111-111111111111"; +const PURL: &str = "pkg:npm/left-pad@1.3.0"; +const ENCODED: &str = "pkg%3Anpm%2Fleft-pad%401.3.0"; +const BEFORE: &[u8] = b"before\n"; +const AFTER: &[u8] = b"after\n"; +const AFTER_B64: &str = "YWZ0ZXIK"; + +fn git_sha256(content: &[u8]) -> String { + let header = format!("blob {}\0", content.len()); + let mut hasher = Sha256::new(); + hasher.update(header.as_bytes()); + hasher.update(content); + hex::encode(hasher.finalize()) +} + +fn sha256_hex(bytes: &[u8]) -> String { + hex::encode(Sha256::digest(bytes)) +} + +fn sri_of(bytes: &[u8]) -> String { + use base64::Engine as _; + use sha2::Sha512; + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) +} + +/// A pristine registry tarball for left-pad@1.3.0 (BEFORE bytes). +fn pristine_tgz() -> Vec { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes) in [ + ( + "package/package.json", + br#"{"name":"left-pad","version":"1.3.0"}"#.as_slice(), + ), + ("package/index.js", BEFORE), + ] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, path, bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() +} + +/// Vendorable npm project: package.json, a v3 lock whose left-pad entry +/// resolves to `resolved_url`/`integrity`, and the installed package. 
+fn write_fixture(root: &Path, resolved_url: &str, integrity: &str) { + std::fs::write( + root.join("package.json"), + r#"{ "name": "repair-vendor-test", "version": "0.0.0" }"#, + ) + .unwrap(); + let lock = serde_json::json!({ + "name": "repair-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "repair-vendor-test", + "version": "0.0.0", + "dependencies": { "left-pad": "^1.3.0" } + }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": resolved_url, + "integrity": integrity, + "license": "WTFPL" + } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap(); + + let pkg = root.join("node_modules/left-pad"); + std::fs::create_dir_all(&pkg).unwrap(); + std::fs::write( + pkg.join("package.json"), + br#"{"name":"left-pad","version":"1.3.0"}"#, + ) + .unwrap(); + std::fs::write(pkg.join("index.js"), BEFORE).unwrap(); +} + +/// Mount discovery + view for `UUID` (same shapes as scan_vendor_e2e). 
+async fn mount_patch_api(mock: &MockServer) { + let before_hash = git_sha256(BEFORE); + let after_hash = git_sha256(AFTER); + Mock::given(method("POST")) + .and(path(format!("/v0/orgs/{ORG_SLUG}/patches/batch"))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "packages": [{ + "purl": PURL, + "patches": [{ + "uuid": UUID, + "purl": PURL, + "tier": "free", + "cveIds": ["CVE-2026-0001"], + "ghsaIds": [], + "severity": "high", + "title": "vendor target" + }] + }], + "canAccessPaidPatches": false, + }))) + .mount(mock) + .await; + Mock::given(method("GET")) + .and(path(format!( + "/v0/orgs/{ORG_SLUG}/patches/by-package/{ENCODED}" + ))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "patches": [{ + "uuid": UUID, + "purl": PURL, + "publishedAt": "2026-01-01T00:00:00Z", + "description": "Vendor patch", + "license": "MIT", + "tier": "free", + "vulnerabilities": {} + }], + "canAccessPaidPatches": false, + }))) + .mount(mock) + .await; + Mock::given(method("GET")) + .and(path(format!("/v0/orgs/{ORG_SLUG}/patches/view/{UUID}"))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "uuid": UUID, + "purl": PURL, + "publishedAt": "2026-01-01T00:00:00Z", + "files": { + "package/index.js": { + "beforeHash": before_hash, + "afterHash": after_hash, + "blobContent": AFTER_B64, + } + }, + "vulnerabilities": { + "GHSA-aaaa-bbbb-cccc": { + "cves": ["CVE-2026-0001"], + "summary": "test vuln", + "severity": "high", + "description": "details" + } + }, + "description": "Vendor patch", + "license": "MIT", + "tier": "free", + }))) + .mount(mock) + .await; +} + +/// Serve the after-blob for `--download-mode file` repairs (test 7's step 1 +/// runs before the ledger is reconstructed, so its vendored entry is not +/// yet excluded from the download phase). 
+async fn mount_blob(mock: &MockServer) { + Mock::given(method("GET")) + .and(path(format!( + "/v0/orgs/{ORG_SLUG}/patches/blob/{}", + git_sha256(AFTER) + ))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(AFTER)) + .mount(mock) + .await; +} + +fn run_cli(root: &Path, mock_uri: &str, argv: &[&str]) -> (i32, String, String) { + let mut full = argv.to_vec(); + full.extend_from_slice(&[ + "--json", + "--api-url", + mock_uri, + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]); + let out = Command::new(binary()) + .args(&full) + .current_dir(root) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + ( + out.status.code().unwrap_or(-1), + String::from_utf8_lossy(&out.stdout).into_owned(), + String::from_utf8_lossy(&out.stderr).into_owned(), + ) +} + +/// `scan --vendor --yes` to establish a vendored project; returns the +/// vendored tarball path. +fn vendor_project(root: &Path, mock_uri: &str, extra: &[&str]) -> PathBuf { + let mut argv = vec!["scan", "--vendor", "--yes"]; + argv.extend_from_slice(extra); + let (code, stdout, stderr) = run_cli(root, mock_uri, &argv); + assert_eq!(code, 0, "vendor setup failed: {stdout} {stderr}"); + let tgz = root.join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")); + assert!(tgz.is_file(), "setup must vendor the tarball"); + tgz +} + +fn parse_env(stdout: &str) -> serde_json::Value { + serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("bad JSON ({e}): {stdout}")) +} + +fn events_of(v: &serde_json::Value) -> Vec { + v["events"].as_array().cloned().unwrap_or_default() +} + +/// 1. Deleted tarball → `repair` rebuilds it byte-identically (installed +/// copy + view-fetched patch content), lockfile and ledger untouched. 
+#[tokio::test] +async fn repair_rebuilds_deleted_vendored_tarball() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + let tgz_bytes = std::fs::read(&tgz).unwrap(); + let lock1 = std::fs::read(tmp.path().join("package-lock.json")).unwrap(); + let state1 = std::fs::read(tmp.path().join(".socket/vendor/state.json")).unwrap(); + + std::fs::remove_file(&tgz).unwrap(); + + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]); + assert_eq!(code, 0, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}"); + assert!( + events_of(&v) + .iter() + .any(|e| e["action"] == "rebuilt" && e["purl"] == PURL), + "envelope={v}" + ); + assert_eq!( + std::fs::read(&tgz).unwrap(), + tgz_bytes, + "deterministic rebuild must reproduce the recorded bytes" + ); + assert_eq!( + std::fs::read(tmp.path().join("package-lock.json")).unwrap(), + lock1, + "lockfile untouched" + ); + assert_eq!( + std::fs::read(tmp.path().join(".socket/vendor/state.json")).unwrap(), + state1, + "ledger untouched" + ); + + // Healthy re-run: nothing to rebuild. + let (code, stdout, _) = run_cli(tmp.path(), &mock.uri(), &["repair"]); + assert_eq!(code, 0); + let v = parse_env(&stdout); + assert!( + v["summary"]["rebuilt"].is_null() || v["summary"]["rebuilt"] == 0, + "healthy ledger rebuilds nothing: {v}" + ); +} + +/// 2. `repair --offline` rebuilds from purely local sources (installed copy +/// + seeded blob) with zero network. 
+#[tokio::test] +async fn repair_offline_rebuilds_from_local_sources() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + std::fs::remove_file(&tgz).unwrap(); + + // Patch content available locally: the after-blob on disk. + let blobs = tmp.path().join(".socket/blobs"); + std::fs::create_dir_all(&blobs).unwrap(); + std::fs::write(blobs.join(git_sha256(AFTER)), AFTER).unwrap(); + + let before_reqs = mock.received_requests().await.unwrap().len(); + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair", "--offline"]); + assert_eq!(code, 0, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}"); + assert!(tgz.is_file(), "tarball rebuilt offline"); + let after_reqs = mock.received_requests().await.unwrap().len(); + assert_eq!( + before_reqs, after_reqs, + "--offline must make no network requests" + ); +} + +/// 3. Truncated/corrupt tarball → detected (whole-file sha vs ledger) and +/// rebuilt. 
+#[tokio::test] +async fn repair_rebuilds_corrupt_vendored_tarball() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + let tgz_bytes = std::fs::read(&tgz).unwrap(); + + std::fs::write(&tgz, b"\x1f\x8bgarbage").unwrap(); + + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]); + assert_eq!(code, 0, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}"); + assert_eq!( + std::fs::read(&tgz).unwrap(), + tgz_bytes, + "rebuild restores the recorded bytes" + ); +} + +/// 4. A tampered ledger sha can never be satisfied: the rebuild is removed +/// and the run fails loudly rather than leaving unverifiable bytes. +#[tokio::test] +async fn repair_fails_closed_on_tampered_ledger_sha() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + + let state_path = tmp.path().join(".socket/vendor/state.json"); + let state = std::fs::read_to_string(&state_path).unwrap(); + let mut v: serde_json::Value = serde_json::from_str(&state).unwrap(); + v["entries"][PURL]["artifact"]["sha256"] = serde_json::json!("0".repeat(64)); + std::fs::write(&state_path, serde_json::to_vec_pretty(&v).unwrap()).unwrap(); + + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]); + assert_eq!(code, 1, "stdout={stdout} stderr={stderr}"); + let env = parse_env(&stdout); + assert!( + events_of(&env) + .iter() + .any(|e| e["action"] == "failed" && e["errorCode"] == "vendor_artifact_rebuild_failed"), + "envelope={env}" + ); + 
assert!( + !tgz.exists(), + "an unverifiable rebuild must not be left on disk" + ); +} + +/// 5. Fresh-clone `vendor` re-run with the committed artifact AND +/// node_modules gone: the ledger's wiring original recovers the registry +/// resolution, the pristine tarball is fetched + verified, and the +/// artifact is rebuilt — exit 0 (previously a hard vendor_fetch_failed). +#[tokio::test] +async fn vendor_rerun_recovers_registry_resolution_from_ledger() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tgz_bytes = pristine_tgz(); + let integrity = sri_of(&tgz_bytes); + Mock::given(method("GET")) + .and(path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz_bytes)) + .mount(&mock) + .await; + let tmp = tempfile::tempdir().unwrap(); + // The PRE-VENDOR lock resolves to the mock registry with the real + // integrity — that's what the ledger preserves as the wiring original. + write_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + let lock1 = std::fs::read(tmp.path().join("package-lock.json")).unwrap(); + + std::fs::remove_file(&tgz).unwrap(); + std::fs::remove_dir_all(tmp.path().join("node_modules")).unwrap(); + + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["vendor"]); + assert_eq!(code, 0, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + assert!( + events_of(&v) + .iter() + .any(|e| e["errorCode"] == "vendor_artifact_missing"), + "the missing artifact is surfaced as a warning skip: {v}" + ); + assert!(tgz.is_file(), "artifact rebuilt from the recovered fetch"); + assert_eq!( + std::fs::read(tmp.path().join("package-lock.json")).unwrap(), + lock1, + "lockfile byte-stable" + ); +} + +/// 6. Detached vendoring (no manifest ever): repair rebuilds via the +/// ledger-embedded record. 
+#[tokio::test] +async fn repair_rebuilds_detached_entry_without_manifest() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &["--detached"]); + assert!( + !tmp.path().join(".socket/manifest.json").exists(), + "detached mode writes no manifest" + ); + std::fs::remove_file(&tgz).unwrap(); + + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]); + assert_eq!(code, 0, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}"); + assert!(tgz.is_file()); +} + +/// 7. The whole `.socket/vendor` tree (state.json included) deleted while +/// the manifest survives: repair reconstructs the ledger entry from the +/// lockfile's vendor-path reference and rebuilds the artifact. +#[tokio::test] +async fn repair_reconstructs_ledger_from_lockfile_references() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + let lock1 = std::fs::read(tmp.path().join("package-lock.json")).unwrap(); + + std::fs::remove_dir_all(tmp.path().join(".socket/vendor")).unwrap(); + + // With the ledger gone, step 1 sees the manifest entry as un-vendored + // and downloads its source; serve the blob and use file mode. 
+ mount_blob(&mock).await; + let (code, stdout, stderr) = run_cli( + tmp.path(), + &mock.uri(), + &["repair", "--download-mode", "file"], + ); + assert_eq!(code, 0, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}"); + assert!(tgz.is_file(), "artifact rebuilt"); + assert_eq!( + std::fs::read(tmp.path().join("package-lock.json")).unwrap(), + lock1, + "lockfile untouched" + ); + + // The re-synthesized ledger entry: same uuid, fingerprint of the + // rebuilt bytes, NOT detached (the manifest still has the record). + let state: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")).unwrap(), + ) + .unwrap(); + let entry = &state["entries"][PURL]; + assert_eq!(entry["uuid"], UUID, "state={state}"); + assert!(entry["detached"].is_null(), "state={state}"); + assert_eq!( + entry["artifact"]["sha256"], + sha256_hex(&std::fs::read(&tgz).unwrap()), + "recomputed fingerprint matches the rebuilt artifact: {state}" + ); + + // Revert degrades gracefully (no recorded originals): exit 0, artifact + // removed, the drifted-entry guidance surfaced. + let (code, stdout, _) = run_cli(tmp.path(), &mock.uri(), &["vendor", "--revert"]); + assert_eq!(code, 0, "revert of a reconstructed entry: {stdout}"); + assert!(!tgz.exists(), "revert removed the artifact"); +} + +/// 8. No ledger AND no manifest — only the rewired lockfile: the uuid in +/// the lock path drives an API view fetch and the entry is re-created +/// DETACHED (manifest-invisible), with the artifact rebuilt. 
+#[tokio::test] +async fn repair_reconstructs_detached_from_lockfile_only() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + + std::fs::remove_dir_all(tmp.path().join(".socket")).unwrap(); + + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]); + assert_eq!(code, 0, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}"); + assert!(tgz.is_file(), "artifact rebuilt"); + + let state: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")).unwrap(), + ) + .unwrap(); + let entry = &state["entries"][PURL]; + assert_eq!(entry["uuid"], UUID, "state={state}"); + assert_eq!( + entry["detached"], true, + "manifest-less reconstruction is detached: {state}" + ); + assert_eq!( + entry["record"]["uuid"], UUID, + "the record is embedded for future repairs/VEX: {state}" + ); +} + +/// 9. The hardest reconstruction: no ledger, no manifest help needed beyond +/// the record, and NO installed copy. The rewired lockfile's recorded +/// integrity is the trust anchor: the pristine tarball is fetched +/// unverified from the conventional registry URL and the REBUILT +/// artifact must reproduce the wired integrity. 
+#[tokio::test] +async fn repair_reconstructs_without_installed_copy_via_wired_integrity() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + Mock::given(method("GET")) + .and(path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(pristine_tgz())) + .mount(&mock) + .await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + + // Fresh-clone hole: vendor tree gone AND nothing installed. + std::fs::remove_dir_all(tmp.path().join(".socket/vendor")).unwrap(); + std::fs::remove_dir_all(tmp.path().join("node_modules")).unwrap(); + + mount_blob(&mock).await; + let out = Command::new(binary()) + .args([ + "repair", + "--download-mode", + "file", + "--json", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .env("SOCKET_NPM_REGISTRY", mock.uri()) + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + assert_eq!( + out.status.code(), + Some(0), + "stdout={stdout} stderr={stderr}" + ); + let v = parse_env(&stdout); + assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}"); + assert!(tgz.is_file(), "artifact rebuilt from the unverified fetch"); + + // The rebuilt tarball's integrity is exactly what the lock records. + let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); + let rebuilt_sri = sri_of(&std::fs::read(&tgz).unwrap()); + assert!( + lock.contains(&rebuilt_sri), + "rebuilt sri {rebuilt_sri} must be the wired one; lock={lock}" + ); +} + +/// 10. A tampered pristine source changes the deterministic rebuild, which +/// then fails the wired-integrity check: nothing is kept, exit 1. 
+#[tokio::test] +async fn repair_reconstruction_rejects_tampered_pristine_source() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + // The "registry" serves a tarball whose non-patched member differs. + let mut tampered = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (p, bytes) in [ + ( + "package/package.json", + br#"{"name":"left-pad","version":"1.3.0","scripts":{"postinstall":"evil"}}"#.as_slice(), + ), + ("package/index.js", BEFORE), + ] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + tampered.append_data(&mut header, p, bytes).unwrap(); + } + let tampered = tampered.into_inner().unwrap().finish().unwrap(); + Mock::given(method("GET")) + .and(path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tampered)) + .mount(&mock) + .await; + + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + std::fs::remove_dir_all(tmp.path().join(".socket/vendor")).unwrap(); + std::fs::remove_dir_all(tmp.path().join("node_modules")).unwrap(); + + mount_blob(&mock).await; + let out = Command::new(binary()) + .args([ + "repair", + "--download-mode", + "file", + "--json", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .env("SOCKET_NPM_REGISTRY", mock.uri()) + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + assert_eq!(out.status.code(), Some(1), "stdout={stdout}"); + let v = parse_env(&stdout); + assert!( + events_of(&v).iter().any(|e| e["action"] == "failed" + && e["errorCode"] == "vendor_artifact_rebuild_failed" + && e["error"] + .as_str() + .unwrap_or("") + 
.contains("integrity the lockfile records")), + "envelope={v}" + ); + assert!(!tgz.exists(), "a tampered rebuild must not be kept"); +} + +/// Dry run previews the rebuild without touching disk. +#[tokio::test] +async fn repair_dry_run_previews_rebuild() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + std::fs::remove_file(&tgz).unwrap(); + + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair", "--dry-run"]); + assert_eq!(code, 0, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + assert!( + events_of(&v).iter().any(|e| e["action"] == "verified" + && e["details"]["wouldRebuild"] == true + && e["purl"] == PURL), + "envelope={v}" + ); + assert!(!tgz.exists(), "dry run writes nothing"); +} + +/// Offline with a broken artifact and NO local sources: a calm, loud, +/// per-entry failure naming the purl and the path; exit 1. +#[tokio::test] +async fn repair_offline_without_sources_fails_loudly() { + let mock = MockServer::start().await; + mount_patch_api(&mock).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "sha512-orig==", + ); + let tgz = vendor_project(tmp.path(), &mock.uri(), &[]); + std::fs::remove_file(&tgz).unwrap(); + // No installed copy either — and no local patch sources. 
+ std::fs::remove_dir_all(tmp.path().join("node_modules")).unwrap(); + + let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair", "--offline"]); + assert_eq!(code, 1, "stdout={stdout} stderr={stderr}"); + let v = parse_env(&stdout); + let failed: Vec<_> = events_of(&v) + .into_iter() + .filter(|e| e["action"] == "failed") + .collect(); + assert!( + failed + .iter() + .any(|e| e["purl"] == PURL && e["error"].as_str().unwrap_or("").contains("--offline")), + "the failure names the purl and the offline cause: {v}" + ); + assert!(!tgz.exists()); +} diff --git a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs index cc13f29..a76bc22 100644 --- a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs +++ b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs @@ -171,6 +171,23 @@ fn run_scan_vendor(root: &Path, mock_uri: &str, extra: &[&str]) -> (i32, String, ) } +/// Vendor flows hold patch content in MEMORY: `.socket/` must end up with +/// nothing beyond the manifest and the committed vendor artifacts — no +/// `blobs/`, `diffs/`, `packages/`, or stray temp files. +fn assert_socket_dir_lean(root: &Path) { + let entries: Vec<String> = std::fs::read_dir(root.join(".socket")) + .expect(".socket exists") + .map(|e| e.unwrap().file_name().to_string_lossy().into_owned()) + .filter(|n| n != "apply.lock") + .collect(); + assert!( + entries + .iter() + .all(|n| n == "manifest.json" || n == "vendor"), + "vendoring must not write blobs or temp files into .socket; found: {entries:?}" + ); +} + #[tokio::test] async fn scan_vendor_manifest_mode_end_to_end() { // scan --vendor: discover → download (manifest written) → vendor. @@ -231,6 +248,7 @@ async fn scan_vendor_manifest_mode_end_to_end() { BEFORE, "installed tree stays pristine" ); + assert_socket_dir_lean(tmp.path()); // Idempotent re-run: already_vendored skip, zero new applies. 
let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); @@ -329,6 +347,10 @@ async fn scan_vendor_detached_mode_writes_no_manifest() { .any(|r| r.url.path().contains("/patches/view/")), "idempotent detached re-run must not re-fetch the patch view" ); + assert!( + !tmp.path().join(".socket/blobs").exists(), + "detached vendoring must never persist blobs" + ); } #[tokio::test] @@ -412,3 +434,761 @@ async fn scan_vendor_flag_conflicts_are_clap_errors() { ); } } + +// ───────────── percent-encoded scoped purls (API canonical form) ───────────── + +const SCOPED_CRAWLER_PURL: &str = "pkg:npm/@scope/left-pad@1.3.0"; +const SCOPED_API_PURL: &str = "pkg:npm/%40scope/left-pad@1.3.0"; + +/// Like `write_fixture`, but the installed package is the SCOPED +/// `@scope/left-pad` (the crawler reports the literal `@scope` form). +fn write_scoped_fixture(root: &Path) { + std::fs::write( + root.join("package.json"), + r#"{ "name": "scan-vendor-test", "version": "0.0.0" }"#, + ) + .unwrap(); + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "scan-vendor-test", + "version": "0.0.0", + "dependencies": { "@scope/left-pad": "^1.3.0" } + }, + "node_modules/@scope/left-pad": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@scope/left-pad/-/left-pad-1.3.0.tgz", + "integrity": "sha512-orig==", + "license": "WTFPL" + } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap(); + + let pkg = root.join("node_modules/@scope/left-pad"); + std::fs::create_dir_all(&pkg).unwrap(); + std::fs::write( + pkg.join("package.json"), + br#"{"name":"@scope/left-pad","version":"1.3.0"}"#, + ) + .unwrap(); + std::fs::write(pkg.join("index.js"), BEFORE).unwrap(); +} + +/// Mock API that serves the patch under the percent-ENCODED purl (the +/// 
canonical form the production patches API returns for scoped packages), +/// while the batch request/response is keyed by the crawler's literal form. +async fn mount_scoped_patch_api(mock: &MockServer, uuid: &str) { + let before_hash = git_sha256(BEFORE); + let after_hash = git_sha256(AFTER); + Mock::given(method("POST")) + .and(path(format!("/v0/orgs/{ORG_SLUG}/patches/batch"))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "packages": [{ + "purl": SCOPED_CRAWLER_PURL, + "patches": [{ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "tier": "free", + "cveIds": ["CVE-2026-0001"], + "ghsaIds": [], + "severity": "high", + "title": "vendor target" + }] + }], + "canAccessPaidPatches": false, + }))) + .mount(mock) + .await; + // Per-package search: the crawler purl, urlencoded. + Mock::given(method("GET")) + .and(path(format!( + "/v0/orgs/{ORG_SLUG}/patches/by-package/pkg%3Anpm%2F%40scope%2Fleft-pad%401.3.0" + ))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "patches": [{ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "publishedAt": "2026-01-01T00:00:00Z", + "description": "Vendor patch", + "license": "MIT", + "tier": "free", + "vulnerabilities": {} + }], + "canAccessPaidPatches": false, + }))) + .mount(mock) + .await; + Mock::given(method("GET")) + .and(path(format!("/v0/orgs/{ORG_SLUG}/patches/view/{uuid}"))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "publishedAt": "2026-01-01T00:00:00Z", + "files": { + "package/index.js": { + "beforeHash": before_hash, + "afterHash": after_hash, + "blobContent": AFTER_B64, + } + }, + "vulnerabilities": {}, + "description": "Vendor patch", + "license": "MIT", + "tier": "free", + }))) + .mount(mock) + .await; +} + +/// The production patches API serves scoped purls percent-encoded +/// (`pkg:npm/%40scope/...`) and scan stores them verbatim as manifest keys. 
+/// The whole pipeline — download, vendor lookup against the literal +/// `node_modules/@scope/...` install, lock rewiring, prune exemption — must +/// bridge the two spellings. (Flowise regression: `%40modelcontextprotocol` +/// failed with `package not installed`.) +#[tokio::test] +async fn scan_vendor_resolves_percent_encoded_scoped_purl() { + let mock = MockServer::start().await; + mount_scoped_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_scoped_fixture(tmp.path()); + + // --prune in the same run: the freshly-downloaded ENCODED manifest + // entry must not be GC'd against the literal crawler purl. + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &["--prune"]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["status"], "success", "envelope={v}"); + + // Manifest keyed by the verbatim encoded purl — and NOT pruned. + let manifest: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/manifest.json")).unwrap(), + ) + .unwrap(); + assert_eq!( + manifest["patches"][SCOPED_API_PURL]["uuid"], UUID, + "manifest={manifest}" + ); + assert_eq!( + v["gc"]["prunedManifestEntries"], + serde_json::json!([]), + "the encoded entry must not look prunable: {v}" + ); + + // Vendored: artifact under the DECODED scope dir, lock rewired. + assert_eq!(v["vendor"]["summary"]["applied"], 1, "envelope={v}"); + let tgz = tmp.path().join(format!( + ".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )); + assert!(tgz.is_file(), "tarball at the decoded scoped path"); + let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); + assert!( + lock.contains(&format!( + ".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )), + "lock consumes the vendored tarball; lock={lock}" + ); + // Ledger keyed by the verbatim encoded purl. 
+ let state: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")).unwrap(), + ) + .unwrap(); + assert_eq!(state["entries"][SCOPED_API_PURL]["uuid"], UUID, "{state}"); +} + +// ───────────────────── prune reconciles vendored state ───────────────────── + +/// After a dependency is removed and re-locked, `scan --prune` (without +/// `--vendor`) reverts the now-unused vendored entry: lock restored, ledger +/// entry + manifest entry dropped, artifact dir removed. +#[tokio::test] +async fn scan_prune_reverts_unused_vendored_entry() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture(tmp.path()); + + // A second installed package so the later prune run's crawl is + // non-empty (left-pad itself gets removed below). + let other = tmp.path().join("node_modules/keeper"); + std::fs::create_dir_all(&other).unwrap(); + std::fs::write( + other.join("package.json"), + br#"{"name":"keeper","version":"1.0.0"}"#, + ) + .unwrap(); + + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + + // Simulate `npm uninstall left-pad` + re-lock: drop the dep from the + // lock graph and remove the installed copy. The override-free npm + // wiring leaves nothing else behind. + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { "name": "scan-vendor-test", "version": "0.0.0" } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(tmp.path().join("package-lock.json"), &lock_bytes).unwrap(); + std::fs::remove_dir_all(tmp.path().join("node_modules/left-pad")).unwrap(); + + // Plain prune scan (read-only discovery + GC; no --vendor, no --apply). 
+ let out = Command::new(binary()) + .args([ + "scan", + "--json", + "--prune", + "--yes", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let code = out.status.code().unwrap_or(-1); + assert_eq!(code, 0, "stdout={stdout}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + + assert_eq!( + v["gc"]["revertedVendoredEntries"], + serde_json::json!([PURL]), + "gc must report the reverted entry: {v}" + ); + + // Ledger empty (an emptied state file may be removed outright), + // manifest entry dropped, artifact gone. + match std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")) { + Ok(text) => { + let state: serde_json::Value = serde_json::from_str(&text).unwrap(); + assert!( + state["entries"].as_object().is_none_or(|m| m.is_empty()), + "ledger entry removed: {state}" + ); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => panic!("unexpected state.json read error: {e}"), + } + let manifest: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/manifest.json")).unwrap(), + ) + .unwrap(); + assert!( + manifest["patches"] + .as_object() + .is_none_or(|m| !m.contains_key(PURL)), + "manifest entry dropped: {manifest}" + ); + assert!( + !tmp.path() + .join(format!(".socket/vendor/npm/{UUID}")) + .exists(), + "artifact dir removed" + ); + // The (already left-pad-free) lock stays exactly as the user re-locked + // it — the revert had nothing to restore there. 
+ assert_eq!( + std::fs::read(tmp.path().join("package-lock.json")).unwrap(), + lock_bytes + ); +} + +/// Interactive (non-JSON) `scan --vendor` pre-verifies patch baselines: +/// installed content matching NEITHER hash is annotated BEFORE the +/// confirm prompt, and the run still vendors (auto-force) with the +/// `vendor_content_mismatch_overwritten` warning on stderr. +#[tokio::test] +async fn scan_vendor_annotates_mismatched_baseline_and_vendors_anyway() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture(tmp.path()); + // Divergent installed bytes: neither BEFORE nor AFTER. + std::fs::write( + tmp.path().join("node_modules/left-pad/index.js"), + b"divergent\n", + ) + .unwrap(); + + let out = Command::new(binary()) + .args([ + "scan", + "--vendor", + "--yes", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + assert_eq!( + out.status.code().unwrap_or(-1), + 0, + "stdout={stdout}; stderr={stderr}" + ); + assert!( + stdout.contains("installed content differs from patch baseline"), + "pre-prompt annotation present; stdout={stdout}" + ); + assert!( + stderr.contains("vendor_content_mismatch_overwritten"), + "overwrite warning surfaced; stderr={stderr}" + ); + // Vendored despite the mismatch. + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); +} + +// ───────────── lockfile auto-fetch + scan lockfile supplement ───────────── + +/// sha512 SRI of the given bytes (what an npm-family lock records). 
+fn sri_of(bytes: &[u8]) -> String { + use base64::Engine as _; + use sha2::Sha512; + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) +} + +/// A pristine registry tarball for left-pad@1.3.0 whose index.js carries +/// the patch's BEFORE bytes. +fn pristine_tgz() -> Vec { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes) in [ + ( + "package/package.json", + br#"{"name":"left-pad","version":"1.3.0"}"#.as_slice(), + ), + ("package/index.js", BEFORE), + ] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, path, bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() +} + +/// Project fixture with a lockfile but NO node_modules: package.json + +/// package-lock.json whose left-pad entry resolves to `resolved_url` with +/// `integrity`. +fn write_lockfile_only_fixture(root: &Path, resolved_url: &str, integrity: &str) { + std::fs::write( + root.join("package.json"), + r#"{ "name": "scan-vendor-test", "version": "0.0.0", "dependencies": { "left-pad": "^1.3.0" } }"#, + ) + .unwrap(); + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "scan-vendor-test", + "version": "0.0.0", + "dependencies": { "left-pad": "^1.3.0" } + }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": resolved_url, + "integrity": integrity, + "license": "WTFPL" + } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap(); +} + +/// Pre-seed `.socket/manifest.json` + the after-blob so a standalone +/// `vendor` run has local patch sources (no patch-API traffic). 
+fn seed_manifest_and_blob(root: &Path) { + let socket = root.join(".socket"); + std::fs::create_dir_all(socket.join("blobs")).unwrap(); + let manifest = serde_json::json!({ + "patches": { + PURL: { + "uuid": UUID, + "exportedAt": "2026-01-01T00:00:00Z", + "files": { + "package/index.js": { + "beforeHash": git_sha256(BEFORE), + "afterHash": git_sha256(AFTER), + } + }, + "vulnerabilities": {}, + "description": "synthetic", + "license": "MIT", + "tier": "free" + } + } + }); + std::fs::write( + socket.join("manifest.json"), + serde_json::to_vec_pretty(&manifest).unwrap(), + ) + .unwrap(); + std::fs::write(socket.join("blobs").join(git_sha256(AFTER)), AFTER).unwrap(); +} + +async fn mount_registry_tarball(mock: &MockServer, tgz: Vec) { + Mock::given(method("GET")) + .and(path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(mock) + .await; +} + +fn run_vendor(root: &Path, extra: &[&str]) -> (i32, serde_json::Value, String) { + let mut argv = vec!["vendor", "--json"]; + argv.extend_from_slice(extra); + let out = Command::new(binary()) + .args(&argv) + .current_dir(root) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run vendor"); + let stdout = String::from_utf8_lossy(&out.stdout).into_owned(); + let stderr = String::from_utf8_lossy(&out.stderr).into_owned(); + let v: serde_json::Value = serde_json::from_str(stdout.trim()) + .unwrap_or_else(|e| panic!("vendor --json must emit JSON: {e}\n{stdout}\n{stderr}")); + (out.status.code().unwrap_or(-1), v, stderr) +} + +/// A manifest patch whose package is NOT installed but IS lockfile-resolved +/// is fetched pristine from the registry (integrity-verified against the +/// lock) and vendored — node_modules never appears. 
+#[tokio::test] +async fn vendor_auto_fetches_missing_package_from_lockfile() { + let mock = MockServer::start().await; + let tgz = pristine_tgz(); + let integrity = sri_of(&tgz); + mount_registry_tarball(&mock, tgz).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_eq!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["action"] == "applied" && e["purl"] == PURL), + "{v:#}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "vendor_fetched_missing"), + "fetch surfaced as a warning event: {v:#}" + ); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); + let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); + assert!(lock.contains(&format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"))); + assert!( + !tmp.path().join("node_modules").exists(), + "the project tree is never touched" + ); +} + +/// Integrity mismatch between the lock and the served bytes is a distinct +/// vendor_fetch_failed failure — and nothing is written. 
+#[tokio::test] +async fn vendor_fetch_integrity_mismatch_is_vendor_fetch_failed() { + let mock = MockServer::start().await; + mount_registry_tarball(&mock, pristine_tgz()).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &sri_of(b"the lock expects different bytes"), + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_ne!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["action"] == "failed" && e["errorCode"] == "vendor_fetch_failed"), + "{v:#}" + ); + assert!( + !events + .iter() + .any(|e| e["errorCode"] == "package_not_installed"), + "no duplicate not-installed skip: {v:#}" + ); + assert!(!tmp.path().join(".socket/vendor").exists()); +} + +/// --offline refuses the fetch with a calm package_not_installed skip that +/// names the lockfile as the would-be source. No HTTP traffic happens (no +/// registry route is mounted — a request would 404 and fail differently). 
+#[tokio::test] +async fn vendor_offline_refuses_fetch_with_calm_skip() { + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "http://127.0.0.1:1/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"irrelevant"), + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &["--offline"]); + assert_ne!(code, 0, "not-installed stays a non-benign skip: {v:#}"); + let events = v["events"].as_array().unwrap(); + let skip = events + .iter() + .find(|e| e["errorCode"] == "package_not_installed") + .unwrap_or_else(|| panic!("{v:#}")); + assert!( + skip["reason"] + .as_str() + .unwrap_or("") + .contains("--offline prevents fetching"), + "offline detail names the lockfile resolution: {v:#}" + ); +} + +/// An entry whose lock records no integrity is never fetched (fail-closed) +/// and keeps the plain not-installed outcome plus an explanatory warning. +#[tokio::test] +async fn vendor_fetch_unverifiable_lock_entry_stays_not_installed() { + let tmp = tempfile::tempdir().unwrap(); + // Hand-write a lock whose entry has no integrity field. 
+ std::fs::write( + tmp.path().join("package.json"), + r#"{ "name": "x", "version": "0.0.0" }"#, + ) + .unwrap(); + std::fs::write( + tmp.path().join("package-lock.json"), + serde_json::to_vec_pretty(&serde_json::json!({ + "name": "x", "version": "0.0.0", "lockfileVersion": 3, + "packages": { + "": { "name": "x", "version": "0.0.0" }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": "http://127.0.0.1:1/left-pad/-/left-pad-1.3.0.tgz" + } + } + })) + .unwrap(), + ) + .unwrap(); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_ne!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "vendor_fetch_unverifiable"), + "{v:#}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "package_not_installed"), + "{v:#}" + ); +} + +/// The headline flow: a COMPLETELY fresh clone (lockfile, no node_modules, +/// no .socket) discovers from the lockfile and `scan --vendor` vendors +/// end-to-end via the registry fetch. 
+#[tokio::test] +async fn scan_vendor_works_on_a_completely_fresh_clone() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tgz = pristine_tgz(); + let integrity = sri_of(&tgz); + mount_registry_tarball(&mock, tgz).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["lockfileOnlyPackages"], 1, "{v}"); + assert_eq!(v["vendor"]["summary"]["applied"], 1, "{v}"); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); + assert!(!tmp.path().join("node_modules").exists()); + assert_socket_dir_lean(tmp.path()); + + // Second run: in sync. + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + let events = v["vendor"]["events"].as_array().unwrap(); + assert!( + events.iter().any(|e| e["errorCode"] == "already_vendored"), + "{v}" + ); + assert_socket_dir_lean(tmp.path()); +} + +/// Read-only discovery flags lockfile-only packages in JSON and the human +/// table. +#[tokio::test] +async fn scan_discovers_lockfile_only_packages_with_warning() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"unused for discovery"), + ); + + // JSON shape. 
+ let out = Command::new(binary()) + .args([ + "scan", + "--json", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["scannedPackages"], 1, "{v}"); + assert_eq!(v["lockfileOnlyPackages"], 1, "{v}"); + assert_eq!(v["packages"][0]["notInstalled"], true, "{v}"); + + // Human output: the table marker + the note. + let out = Command::new(binary()) + .args([ + "scan", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + "--dry-run", + "--yes", + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + assert!( + stdout.contains("[NOT INSTALLED]"), + "stdout={stdout}; stderr={stderr}" + ); + assert!( + stderr.contains("not yet installed (lockfile-only)"), + "stderr={stderr}" + ); +} + +/// `scan --apply` skips lockfile-only patches calmly: exit 0, a skipped +/// record with package_not_installed, and NO manifest entry written. 
+#[tokio::test] +async fn scan_apply_skips_lockfile_only_without_error() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"unused"), + ); + + let out = Command::new(binary()) + .args([ + "scan", + "--json", + "--apply", + "--yes", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let code = out.status.code().unwrap_or(-1); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(code, 0, "lockfile-only must not flip the exit code: {v}"); + assert_eq!(v["status"], "success", "{v}"); + let patches = v["apply"]["patches"].as_array().unwrap(); + assert!( + patches + .iter() + .any(|p| p["action"] == "skipped" && p["errorCode"] == "package_not_installed"), + "{v}" + ); + assert!( + !tmp.path().join(".socket/manifest.json").exists(), + "no manifest entry is written for a not-installed package" + ); +} diff --git a/crates/socket-patch-core/src/crawlers/deno_crawler.rs b/crates/socket-patch-core/src/crawlers/deno_crawler.rs index 5a12c2d..9a7d784 100644 --- a/crates/socket-patch-core/src/crawlers/deno_crawler.rs +++ b/crates/socket-patch-core/src/crawlers/deno_crawler.rs @@ -120,16 +120,18 @@ impl DenoCrawler { // manifest PURL and are joined onto the cache root below. A real // JSR coordinate is a single path segment, so reject any that // could traverse out of the cache (`..`/`.`, a separator, NUL). + // The parser percent-decodes components, so these guards see the + // decoded form — `%2e%2e` cannot smuggle a traversal past them. 
// Unlike the cargo/npm crawlers there is no content check to catch // a bogus path, and jsr patches in place — so fail closed here. - if !(is_safe_jsr_component(scope) - && is_safe_jsr_component(name) - && is_safe_jsr_component(version)) + if !(is_safe_jsr_component(&scope) + && is_safe_jsr_component(&name) + && is_safe_jsr_component(&version)) { continue; } // Cache layout: //// - let pkg_dir = jsr_cache_path.join(scope).join(name).join(version); + let pkg_dir = jsr_cache_path.join(&*scope).join(&*name).join(&*version); if !is_dir(&pkg_dir).await { continue; } diff --git a/crates/socket-patch-core/src/crawlers/npm_crawler.rs b/crates/socket-patch-core/src/crawlers/npm_crawler.rs index 91cb624..2d9f8d3 100644 --- a/crates/socket-patch-core/src/crawlers/npm_crawler.rs +++ b/crates/socket-patch-core/src/crawlers/npm_crawler.rs @@ -4,6 +4,7 @@ use std::path::{Path, PathBuf}; use serde::Deserialize; use super::types::{CrawledPackage, CrawlerOptions}; +use crate::utils::purl::{percent_decode_purl_component, strip_purl_qualifiers}; /// Default batch size for crawling. #[cfg(test)] @@ -686,11 +687,7 @@ impl NpmCrawler { /// Parse a PURL string to extract namespace, name, and version. 
fn parse_purl_components(purl: &str) -> Option<(Option, String, String)> { - // Strip qualifiers - let base = match purl.find('?') { - Some(idx) => &purl[..idx], - None => purl, - }; + let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:npm/")?; let at_idx = rest.rfind('@')?; @@ -701,16 +698,33 @@ impl NpmCrawler { return None; } - if name_part.starts_with('@') { - let slash_idx = name_part.find('/')?; - let namespace = name_part[..slash_idx].to_string(); - let name = name_part[slash_idx + 1..].to_string(); - if name.is_empty() { + // SECURITY: components are percent-decoded AFTER the `/`/`@` splits + // above (so an encoded `%2f` cannot create a new path segment here) + // and BEFORE the `is_safe_npm_component` guards in `find_by_purls` + // (so `%2e%2e` cannot smuggle a traversal past them). The API serves + // scoped purls as `pkg:npm/%40scope/name@version`, which must match + // the literal `node_modules/@scope/name` install. + let version = percent_decode_purl_component(version); + + if let Some(slash_idx) = name_part.find('/') { + let namespace = percent_decode_purl_component(&name_part[..slash_idx]); + let name = percent_decode_purl_component(&name_part[slash_idx + 1..]); + // An npm namespace is always an `@scope` (checked post-decode). + if name.is_empty() || !namespace.starts_with('@') { return None; } - Some((Some(namespace), name, version.to_string())) + Some(( + Some(namespace.into_owned()), + name.into_owned(), + version.into_owned(), + )) } else { - Some((None, name_part.to_string(), version.to_string())) + let name = percent_decode_purl_component(name_part); + // A bare `@scope` with no `/name` is not a package name. 
+ if name.starts_with('@') { + return None; + } + Some((None, name.into_owned(), version.into_owned())) } } } @@ -1031,6 +1045,93 @@ mod tests { assert!(!result.contains_key("pkg:npm/not-installed@0.0.1")); } + /// Regression: the patches API serves scoped purls percent-encoded + /// (`pkg:npm/%40scope/name@version`) and `scan` stores them verbatim as + /// manifest keys. `find_by_purls` must decode the components to match + /// the literal `node_modules/@scope/name` install — while keeping the + /// result keyed by the *verbatim* encoded input (downstream contract). + #[test] + fn test_parse_purl_components_percent_encoded_scope() { + let (ns, name, ver) = + NpmCrawler::parse_purl_components("pkg:npm/%40modelcontextprotocol/sdk@1.12.0") + .unwrap(); + assert_eq!(ns.as_deref(), Some("@modelcontextprotocol")); + assert_eq!(name, "sdk"); + assert_eq!(ver, "1.12.0"); + // An encoded bare scope with no `/name` is still not a package. + assert!(NpmCrawler::parse_purl_components("pkg:npm/%40scope@1.0.0").is_none()); + // A `#subpath` without a qualifier must not bleed into the version. 
+ let (_, name, ver) = + NpmCrawler::parse_purl_components("pkg:npm/foo@1.0.0#lib/util").unwrap(); + assert_eq!(name, "foo"); + assert_eq!(ver, "1.0.0"); + } + + #[tokio::test] + async fn test_find_by_purls_percent_encoded_scope_resolves() { + let dir = tempfile::tempdir().unwrap(); + let nm = dir.path().join("node_modules"); + + let sdk_dir = nm.join("@modelcontextprotocol").join("sdk"); + tokio::fs::create_dir_all(&sdk_dir).await.unwrap(); + tokio::fs::write( + sdk_dir.join("package.json"), + r#"{"name": "@modelcontextprotocol/sdk", "version": "1.12.0"}"#, + ) + .await + .unwrap(); + + let crawler = NpmCrawler::new(); + let encoded = "pkg:npm/%40modelcontextprotocol/sdk@1.12.0".to_string(); + let result = crawler + .find_by_purls(&nm, std::slice::from_ref(&encoded)) + .await + .unwrap(); + + assert_eq!(result.len(), 1, "encoded scope must resolve: {result:?}"); + let pkg = result + .get(&encoded) + .expect("result keyed by the verbatim encoded input purl"); + assert_eq!(pkg.path, sdk_dir); + assert_eq!(pkg.name, "sdk"); + assert_eq!(pkg.namespace.as_deref(), Some("@modelcontextprotocol")); + } + + /// SECURITY regression: percent-encoded traversal sequences must be + /// rejected by the post-decode guards — `%2e%2e` decodes to `..` and + /// `%2f` to `/`, so guarding the *encoded* form would be a bypass. + #[tokio::test] + async fn test_find_by_purls_rejects_encoded_traversal() { + let root = tempfile::tempdir().unwrap(); + let nm = root.path().join("node_modules"); + // A real scope dir so a scoped traversal's kernel walk could resolve. + tokio::fs::create_dir_all(nm.join("@x")).await.unwrap(); + + // A victim package OUTSIDE node_modules, reachable only via `..`. 
+ let evil_dir = root.path().join("evil"); + tokio::fs::create_dir_all(&evil_dir).await.unwrap(); + tokio::fs::write( + evil_dir.join("package.json"), + r#"{"name": "evil", "version": "1.0.0"}"#, + ) + .await + .unwrap(); + + let crawler = NpmCrawler::new(); + let purls = vec![ + "pkg:npm/%2e%2e/evil@1.0.0".to_string(), + "pkg:npm/@x/%2e%2e@1.0.0".to_string(), + "pkg:npm/@x/%2e%2e%2f%2e%2e%2fevil@1.0.0".to_string(), + "pkg:npm/..%2fevil@1.0.0".to_string(), + ]; + let result = crawler.find_by_purls(&nm, &purls).await.unwrap(); + + assert!( + result.is_empty(), + "encoded traversal must not escape node_modules; got {result:?}" + ); + } + /// Regression: a qualified PURL (carrying `?qualifiers`) must resolve and /// be keyed by the *verbatim* input PURL — not a reconstructed, stripped /// form. The dispatcher drives npm with `passthrough_purls` + diff --git a/crates/socket-patch-core/src/patch/apply.rs b/crates/socket-patch-core/src/patch/apply.rs index e880707..da1ce6d 100644 --- a/crates/socket-patch-core/src/patch/apply.rs +++ b/crates/socket-patch-core/src/patch/apply.rs @@ -11,7 +11,7 @@ use crate::patch::file_hash::compute_file_git_sha256; use crate::patch::package::read_archive_filtered; /// Status of a file patch verification. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum VerifyStatus { /// File is ready to be patched (current hash matches beforeHash). Ready, @@ -34,6 +34,33 @@ pub struct VerifyResult { pub target_hash: Option, } +/// How the apply pipeline treats a file whose on-disk content matches +/// NEITHER `beforeHash` nor `afterHash` (and a pre-existing file that is +/// missing). 
+/// +/// Mismatch tolerance is safe content-wise in every mode: the diff +/// strategy self-disables on a wrong base, and the archive/blob +/// strategies verify their bytes hash to exactly `afterHash` BEFORE any +/// write — a tolerated mismatch is overwritten with the verified patched +/// content or fails, never silently corrupted. What tolerance can do is +/// discard local modifications to the dependency file, which is why +/// `Strict` exists. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum MismatchPolicy { + /// DEFAULT: a beforeHash mismatch is overwritten with the verified + /// patched content and surfaced as a warning (the promoted + /// [`VerifyResult`] keeps `expected_hash`/`current_hash`, which is + /// how callers detect and report it). A MISSING pre-existing file is + /// still a hard error. + #[default] + Warn, + /// A beforeHash mismatch is a hard error (`--strict`). + Strict, + /// [`MismatchPolicy::Warn`] PLUS missing pre-existing files are + /// skipped instead of failing (`--force`). + Force, +} + /// Which patch source actually wrote the patched bytes for a file. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum AppliedVia { @@ -67,6 +94,11 @@ pub struct PatchSources<'a> { pub blobs_path: &'a Path, pub packages_path: Option<&'a Path>, pub diffs_path: Option<&'a Path>, + /// In-memory blob overlay (`afterHash` → patched bytes), consulted + /// BEFORE the on-disk blob dir. The vendor flows stage their patch + /// content here so vendoring writes no `.socket/blobs` entries and no + /// temporary files — the bytes live only for the run. 
+ pub mem_blobs: Option<&'a HashMap>>, } impl<'a> PatchSources<'a> { @@ -78,6 +110,7 @@ impl<'a> PatchSources<'a> { blobs_path, packages_path: None, diffs_path: None, + mem_blobs: None, } } } @@ -682,7 +715,7 @@ pub async fn apply_package_patch( sources: &PatchSources<'_>, uuid: Option<&str>, dry_run: bool, - force: bool, + policy: MismatchPolicy, ) -> ApplyResult { let mut result = ApplyResult { package_key: package_key.to_string(), @@ -714,30 +747,32 @@ pub async fn apply_package_patch( if verify_result.status != VerifyStatus::Ready && verify_result.status != VerifyStatus::AlreadyPatched { - if force { - match verify_result.status { - VerifyStatus::HashMismatch => { - // Force: treat hash mismatch as ready - verify_result.status = VerifyStatus::Ready; - } - VerifyStatus::NotFound => { - // Force: skip files that don't exist (non-new files) - result.files_verified.push(verify_result); - continue; - } - _ => {} + match (verify_result.status, policy) { + // Mismatch tolerated (default + force): promote to Ready. + // The promoted result KEEPS `expected_hash`/`current_hash` + // — the signature callers use to surface the warning. The + // diff strategy self-disables on the wrong base; the + // archive/blob strategies are hash-gated to afterHash. + (VerifyStatus::HashMismatch, MismatchPolicy::Warn | MismatchPolicy::Force) => { + verify_result.status = VerifyStatus::Ready; + } + // Force only: skip missing pre-existing files. 
+ (VerifyStatus::NotFound, MismatchPolicy::Force) => { + result.files_verified.push(verify_result); + continue; + } + _ => { + let msg = verify_result + .message + .clone() + .unwrap_or_else(|| format!("{:?}", verify_result.status)); + result.error = Some(format!( + "Cannot apply patch: {} - {}", + verify_result.file, msg + )); + result.files_verified.push(verify_result); + return result; } - } else { - let msg = verify_result - .message - .clone() - .unwrap_or_else(|| format!("{:?}", verify_result.status)); - result.error = Some(format!( - "Cannot apply patch: {} - {}", - verify_result.file, msg - )); - result.files_verified.push(verify_result); - return result; } } @@ -848,16 +883,27 @@ pub async fn apply_package_patch( continue; } - // ── Strategy 3: per-file blob (legacy fallback) ────────────── - let blob_path = sources.blobs_path.join(&file_info.after_hash); - let patched_content = match tokio::fs::read(&blob_path).await { - Ok(content) => content, - Err(e) => { - result.error = Some(format!( - "Failed to read blob {}: {}", - file_info.after_hash, e - )); - return result; + // ── Strategy 3: per-file blob ──────────────────────────────── + // The in-memory overlay wins (vendor flows stage there — no + // `.socket/blobs` writes); the on-disk dir is the fallback. 
+ let mem_hit = sources + .mem_blobs + .and_then(|m| m.get(&file_info.after_hash)) + .cloned(); + let patched_content = match mem_hit { + Some(content) => content, + None => { + let blob_path = sources.blobs_path.join(&file_info.after_hash); + match tokio::fs::read(&blob_path).await { + Ok(content) => content, + Err(e) => { + result.error = Some(format!( + "Failed to read blob {}: {}", + file_info.after_hash, e + )); + return result; + } + } } }; @@ -1654,7 +1700,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1706,7 +1752,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1743,7 +1789,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, true, - false, + MismatchPolicy::Warn, ) .await; @@ -1785,7 +1831,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1818,7 +1864,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1826,24 +1872,23 @@ mod tests { assert!(result.error.is_some()); } + /// beforeHash mismatch across the three policies: the DEFAULT (Warn) + /// overwrites with the verified patched content and keeps the + /// promoted warning signature (`Ready` + `expected_hash: Some` + + /// differing `current_hash`); `Strict` is the old hard error; `Force` + /// behaves like Warn (its extra tolerance is missing files). 
#[tokio::test] - async fn test_apply_package_patch_force_hash_mismatch() { + async fn test_apply_package_patch_hash_mismatch_policies() { let pkg_dir = tempfile::tempdir().unwrap(); let blobs_dir = tempfile::tempdir().unwrap(); let patched = b"patched content"; let after_hash = compute_git_sha256_from_bytes(patched); + let divergent = b"something unexpected"; - // Write a file whose hash does NOT match before_hash - tokio::fs::write(pkg_dir.path().join("index.js"), b"something unexpected") - .await - .unwrap(); - - // Write blob tokio::fs::write(blobs_dir.path().join(&after_hash), patched) .await .unwrap(); - let mut files = HashMap::new(); files.insert( "index.js".to_string(), @@ -1853,25 +1898,41 @@ mod tests { }, ); - // Without force: should fail - let result = apply_package_patch( - "pkg:npm/test@1.0.0", - pkg_dir.path(), - &files, - &PatchSources::blobs_only(blobs_dir.path()), - None, - false, - false, - ) - .await; - assert!(!result.success); + for policy in [MismatchPolicy::Warn, MismatchPolicy::Force] { + tokio::fs::write(pkg_dir.path().join("index.js"), divergent) + .await + .unwrap(); + let result = apply_package_patch( + "pkg:npm/test@1.0.0", + pkg_dir.path(), + &files, + &PatchSources::blobs_only(blobs_dir.path()), + None, + false, + policy, + ) + .await; + assert!(result.success, "{policy:?}: {:?}", result.error); + assert_eq!(result.files_patched.len(), 1, "{policy:?}"); + // The promoted verify keeps the mismatch signature for the + // caller's warning report. + let v = &result.files_verified[0]; + assert_eq!(v.status, VerifyStatus::Ready, "{policy:?}"); + assert!( + v.expected_hash.is_some() && v.current_hash != v.expected_hash, + "{policy:?}: promoted signature retained" + ); + // The bytes on disk are EXACTLY the verified patched content. 
+ let written = tokio::fs::read(pkg_dir.path().join("index.js")) + .await + .unwrap(); + assert_eq!(written, patched, "{policy:?}"); + } - // Reset the file - tokio::fs::write(pkg_dir.path().join("index.js"), b"something unexpected") + // Strict: the old fail-closed behavior, file untouched. + tokio::fs::write(pkg_dir.path().join("index.js"), divergent) .await .unwrap(); - - // With force: should succeed let result = apply_package_patch( "pkg:npm/test@1.0.0", pkg_dir.path(), @@ -1879,16 +1940,38 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - true, + MismatchPolicy::Strict, ) .await; - assert!(result.success); - assert_eq!(result.files_patched.len(), 1); + assert!(!result.success); + assert!(result + .error + .as_deref() + .unwrap_or("") + .contains("does not match")); + let untouched = tokio::fs::read(pkg_dir.path().join("index.js")) + .await + .unwrap(); + assert_eq!(untouched, divergent, "strict never writes"); - let written = tokio::fs::read(pkg_dir.path().join("index.js")) + // A missing pre-existing file is STILL an error by default and + // under strict — only Force skips it. 
+ tokio::fs::remove_file(pkg_dir.path().join("index.js")) .await .unwrap(); - assert_eq!(written, patched); + for policy in [MismatchPolicy::Warn, MismatchPolicy::Strict] { + let result = apply_package_patch( + "pkg:npm/test@1.0.0", + pkg_dir.path(), + &files, + &PatchSources::blobs_only(blobs_dir.path()), + None, + false, + policy, + ) + .await; + assert!(!result.success, "{policy:?}: missing file fails closed"); + } } #[tokio::test] @@ -1913,7 +1996,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(!result.success); @@ -1926,7 +2009,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - true, + MismatchPolicy::Force, ) .await; assert!(result.success); @@ -2046,6 +2129,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2054,7 +2138,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2081,6 +2165,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2089,7 +2174,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2115,6 +2200,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2123,7 +2209,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2144,6 +2230,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2152,7 +2239,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -2181,6 
+2268,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2189,7 +2277,7 @@ mod tests { &sources, Some(TEST_UUID), false, - true, // --force + MismatchPolicy::Force, ) .await; @@ -2221,6 +2309,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2229,7 +2318,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2251,6 +2340,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2259,7 +2349,7 @@ mod tests { &sources, Some(TEST_UUID), true, // dry-run - false, + MismatchPolicy::Warn, ) .await; @@ -2553,7 +2643,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; diff --git a/crates/socket-patch-core/src/patch/go_redirect.rs b/crates/socket-patch-core/src/patch/go_redirect.rs index 34e5778..63a43f2 100644 --- a/crates/socket-patch-core/src/patch/go_redirect.rs +++ b/crates/socket-patch-core/src/patch/go_redirect.rs @@ -27,7 +27,8 @@ use std::path::{Path, PathBuf}; use crate::manifest::schema::{PatchFileInfo, PatchManifest}; use crate::patch::apply::{ - apply_package_patch, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, + apply_package_patch, normalize_file_path, ApplyResult, MismatchPolicy, PatchSources, + VerifyResult, VerifyStatus, }; use crate::patch::file_hash::compute_file_git_sha256; use crate::utils::purl::{build_golang_purl, parse_golang_purl, strip_purl_qualifiers}; @@ -164,7 +165,7 @@ pub async fn apply_go_redirect( sources: &PatchSources<'_>, uuid: Option<&str>, dry_run: bool, - force: bool, + policy: MismatchPolicy, ) -> 
ApplyResult { // SECURITY: refuse coordinates that would escape the copy base. // A `..`/separator-laden `module`/`version` (a tampered manifest PURL) would @@ -195,7 +196,7 @@ pub async fn apply_go_redirect( // Verify (read-only) against the pristine source for an accurate // "would patch" report, without creating the copy or editing go.mod. let mut result = - apply_package_patch(purl, pristine_src, files, sources, uuid, true, force).await; + apply_package_patch(purl, pristine_src, files, sources, uuid, true, policy).await; result.package_path = copy_dir.display().to_string(); result.sidecar = None; // a replace copy is not the cache (no go.sum advisory) return result; @@ -235,7 +236,8 @@ pub async fn apply_go_redirect( } // Delegate to the hardened pipeline, pointed at the copy. - let mut result = apply_package_patch(purl, ©_dir, files, sources, uuid, false, force).await; + let mut result = + apply_package_patch(purl, ©_dir, files, sources, uuid, false, policy).await; result.package_path = copy_dir.display().to_string(); // The golang sidecar advisory ("go mod verify will fail against go.sum") // is about in-cache patching; a `replace` copy bypasses go.sum entirely, so @@ -761,7 +763,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -810,7 +812,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -830,7 +832,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -866,7 +868,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -884,7 +886,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -910,7 +912,7 @@ mod tests { &sources, None, true, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -945,7 +947,7 @@ mod tests { &sources, None, false, - false, + 
MismatchPolicy::Warn, ) .await; assert!(!result.success); @@ -980,7 +982,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1016,7 +1018,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1056,7 +1058,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1090,7 +1092,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1119,7 +1121,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; // Add a user-authored replace. @@ -1157,7 +1159,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1209,7 +1211,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; // Drop the directive but keep the copy. @@ -1243,7 +1245,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1287,7 +1289,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1328,7 +1330,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1365,7 +1367,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -1446,7 +1448,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1495,7 +1497,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1536,7 +1538,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(!result.success); diff --git a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs index 8199bc6..f35a2c2 100644 --- a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs +++ 
b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs @@ -82,7 +82,7 @@ pub async fn vendor_bun( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); // BN3 spelling: BARE project-relative path, no `file:`/`./` prefix. let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); @@ -136,6 +136,7 @@ pub async fn vendor_bun( sources, dry_run, force, + &mut warnings, ) .await { @@ -422,7 +423,7 @@ fn revert_one_record( // ───────────────────────── conservative line grammar ────────────────────── /// One parsed single-line packages entry. -struct BunEntry { +pub(super) struct BunEntry { line_idx: usize, /// Leading whitespace, re-emitted verbatim. indent: String, @@ -431,7 +432,7 @@ struct BunEntry { /// The key token exactly as spelled (incl. quotes), re-emitted verbatim. key_raw: String, /// Verbatim top-level tuple elements (trimmed). - elems: Vec, + pub(super) elems: Vec, trailing_comma: bool, } @@ -471,14 +472,14 @@ fn classify(entry: &BunEntry, target_spec: &str, name: &str) -> Option Option<(&str, &str)> { +pub(super) fn split_name_spec(s: &str) -> Option<(&str, &str)> { let at = s.rfind('@').filter(|&i| i > 0)?; Some((&s[..at], &s[at + 1..])) } /// `"lockfileVersion": ` head check — only the fixture-pinned text /// lockfile version is spliced (fail-closed on anything newer/older). -fn check_lock_version(text: &str) -> Result<(), String> { +pub(super) fn check_lock_version(text: &str) -> Result<(), String> { let version = text.lines().take(5).find_map(|line| { line.trim() .strip_prefix("\"lockfileVersion\":") @@ -513,7 +514,7 @@ fn packages_bounds(lines: &[String]) -> Option<(usize, usize)> { /// Strictly parse every entry line of the packages section. Any line that /// is neither blank nor a single-line `"key": [tuple]` entry fails CLOSED. 
-fn parse_packages_section(lines: &[String]) -> Result, String> { +pub(super) fn parse_packages_section(lines: &[String]) -> Result, String> { let Some((start, end)) = packages_bounds(lines) else { // No (or unterminated) packages section: an empty lock simply has // no entries; an unterminated one is malformed. @@ -650,7 +651,7 @@ fn split_top_level(interior: &str) -> Result, String> { } /// Decode a verbatim JSON string token; `None` if it is not one. -fn decode_json_string(token: &str) -> Option { +pub(super) fn decode_json_string(token: &str) -> Option { if !token.starts_with('"') { return None; } diff --git a/crates/socket-patch-core/src/patch/vendor/cargo.rs b/crates/socket-patch-core/src/patch/vendor/cargo.rs index 85ea5cb..5e4eeea 100644 --- a/crates/socket-patch-core/src/patch/vendor/cargo.rs +++ b/crates/socket-patch-core/src/patch/vendor/cargo.rs @@ -18,7 +18,7 @@ use std::path::{Path, PathBuf}; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, + normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::file_hash::compute_file_git_sha256; @@ -77,14 +77,8 @@ async fn lock_entry_detached(project_root: &Path, name: &str, version: &str) -> /// `afterHash`, the config entry points at this copy, and the lock entry is /// already detached — i.e. a re-run has nothing to do. Touch nothing then, so /// cargo's source fingerprint and the committed bytes stay stable. -async fn vendor_in_sync( - copy_dir: &Path, - files: &HashMap, - project_root: &Path, - name: &str, - version: &str, - copy_rel: &str, -) -> bool { +/// The committed copy exists and every patched file matches its afterHash. 
+async fn copy_hashes_ok(copy_dir: &Path, files: &HashMap) -> bool { if tokio::fs::metadata(copy_dir).await.is_err() { return false; } @@ -95,6 +89,12 @@ async fn vendor_in_sync( _ => return false, } } + true +} + +/// The config `[patch]` entry points at THIS copy and the lock entry is +/// already detached — the wiring half of the in-sync test. +async fn wiring_in_sync(project_root: &Path, name: &str, version: &str, copy_rel: &str) -> bool { let entries = cargo_config::read_patch_entries(project_root).await; if entries.get(name).and_then(|i| i.path.as_deref()) != Some(copy_rel) { return false; @@ -269,46 +269,91 @@ pub async fn vendor_cargo_crate( } if dry_run { - // Verify (read-only) against the pristine source — apply_package_patch - // never writes when dry_run — for an accurate "would patch" report, - // without creating the copy or editing config/lock. - let mut result = apply_package_patch( + // Verify (read-only) against the pristine source — the apply + // pipeline never writes when dry_run — for an accurate "would + // patch" report (including the auto-force overwrite warnings the + // real run would emit), without creating the copy or editing + // config/lock. + let mut dry_warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, pristine_src, - &record.files, + record, sources, - Some(&record.uuid), true, force, + name, + version, + &mut dry_warnings, ) .await; result.package_path = copy_dir.display().to_string(); result.sidecar = None; - return done(result, None, Vec::new()); + return done(result, None, dry_warnings); } // Hot path: already in sync → touch nothing (entry stays with the caller's // existing ledger record, which holds the unrecoverable lock originals). 
- if vendor_in_sync( - ©_dir, - &record.files, - project_root, - name, - version, - ©_rel, - ) - .await - { - let verified = record - .files - .keys() - .map(|f| already_patched_verify(f)) - .collect(); - return done( - synthesized_result(purl, ©_dir, verified, true, None), - None, - Vec::new(), - ); + if wiring_in_sync(project_root, name, version, ©_rel).await { + if copy_hashes_ok(©_dir, &record.files).await { + let verified = record + .files + .keys() + .map(|f| already_patched_verify(f)) + .collect(); + return done( + synthesized_result(purl, ©_dir, verified, true, None), + None, + Vec::new(), + ); + } + // Wired but the committed copy is missing/stale: rebuild the + // ARTIFACT only — config + lock are already correct, and the full + // path's surgery would re-record live vendored state over the + // first run's unrecoverable lock originals. + if let Err(e) = fresh_copy(pristine_src, ©_dir, Some(".cargo-checksum.json")).await { + let _ = remove_tree(&uuid_dir).await; + return done( + synthesized_result( + purl, + ©_dir, + Vec::new(), + false, + Some(format!("failed to copy pristine source: {e}")), + ), + None, + Vec::new(), + ); + } + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( + purl, + ©_dir, + record, + sources, + false, + force, + name, + version, + &mut warnings, + ) + .await; + result.package_path = copy_dir.display().to_string(); + if !result.success { + let _ = remove_tree(&uuid_dir).await; + return done(result, None, warnings); + } + // Same path-dep invariant as the full path: no checksum sidecar. 
+ let _ = tokio::fs::remove_file(copy_dir.join(".cargo-checksum.json")).await; + result.sidecar = None; + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored copy for {name}@{version} was missing or stale; \ + rebuilt at {copy_rel} (config and lock untouched)" + ), + )); + return done(result, None, warnings); } // ── materialise the patched copy ────────────────────────────────────── @@ -333,15 +378,19 @@ pub async fn vendor_cargo_crate( ); } - // Delegate to the hardened pipeline, pointed at the copy. - let mut result = apply_package_patch( + // Delegate to the hardened pipeline (vendor auto-force policy — see + // `force_apply_staged`), pointed at the copy. + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, ©_dir, - &record.files, + record, sources, - Some(&record.uuid), false, force, + name, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -350,7 +399,7 @@ pub async fn vendor_cargo_crate( // Don't leave a half-built copy (or an empty uuid husk) that // verify/sweep would misjudge. let _ = remove_tree(&uuid_dir).await; - return done(result, None, Vec::new()); + return done(result, None, warnings); } // A path-dep copy must never carry a checksum sidecar. The fresh copy @@ -370,10 +419,9 @@ pub async fn vendor_cargo_crate( let _ = remove_tree(&uuid_dir).await; result.success = false; result.error = Some(format!("failed to update .cargo/config.toml: {e}")); - return done(result, None, Vec::new()); + return done(result, None, warnings); } - let mut warnings = Vec::new(); let prior_path = prior_entry.as_ref().and_then(|i| i.path.clone()); if prior_path.as_deref().is_some_and(is_legacy_redirect_path) { warnings.push(VendorWarning::new( @@ -1034,6 +1082,57 @@ mod tests { ); } + /// Wired config+lock with a deleted committed copy: the artifact is + /// rebuilt in place, config and lock stay byte-identical, no fresh entry. 
+ #[tokio::test] + async fn test_wired_missing_copy_rebuilds_artifact_only() { + let (dir, blobs, pristine, record) = fixture().await; + let root = dir.path(); + expect_done(run_vendor(PURL, root, &blobs, &pristine, &record, false).await); + + let copy = root.join(copy_rel()).join("src/lib.rs"); + let cfg = root.join(".cargo/config.toml"); + let lock = root.join("Cargo.lock"); + let copy1 = tokio::fs::read(©).await.unwrap(); + let cfg1 = tokio::fs::read(&cfg).await.unwrap(); + let lock1 = tokio::fs::read(&lock).await.unwrap(); + + crate::patch::copy_tree::remove_tree(&root.join(copy_rel())) + .await + .unwrap(); + + let (result, entry, warnings) = + expect_done(run_vendor(PURL, root, &blobs, &pristine, &record, false).await); + assert!(result.success, "{:?}", result.error); + assert!( + entry.is_none(), + "artifact-only rebuild must not emit a fresh entry" + ); + assert!( + warnings.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {warnings:?}" + ); + assert_eq!( + tokio::fs::read(©).await.unwrap(), + copy1, + "rebuilt copy carries the patched bytes" + ); + assert!( + !root.join(copy_rel()).join(".cargo-checksum.json").exists(), + "no checksum sidecar in the rebuilt path-dep copy" + ); + assert_eq!( + tokio::fs::read(&cfg).await.unwrap(), + cfg1, + "config untouched" + ); + assert_eq!( + tokio::fs::read(&lock).await.unwrap(), + lock1, + "lock untouched" + ); + } + #[tokio::test] async fn test_dry_run_writes_nothing() { let (dir, blobs, pristine, record) = fixture().await; diff --git a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs index b2ae9cd..30a0334 100644 --- a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs @@ -35,8 +35,8 @@ use serde_json::{json, Map, Value}; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, 
is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, - VerifyResult, VerifyStatus, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, VerifyResult, + VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::file_hash::compute_file_git_sha256; @@ -176,38 +176,98 @@ pub async fn vendor_composer( // at the uuid path → touch nothing, report AlreadyPatched. `entry` stays // `None`: the first run's ledger entry holds the only copy of the // verbatim pre-vendor original, and re-recording here would clobber it. - if entry_is_wired(&lock[section][idx], ©_rel) - && copy_matches_after_hashes(©_dir, &record.files).await - { - let verified = record - .files - .keys() - .map(|f| already_patched_verify(f)) - .collect(); - return VendorOutcome::Done { - result: synthesized_result(purl, ©_dir, verified, true, None), - entry: None, - warnings: Vec::new(), - }; + if entry_is_wired(&lock[section][idx], ©_rel) { + if copy_matches_after_hashes(©_dir, &record.files).await { + let verified = record + .files + .keys() + .map(|f| already_patched_verify(f)) + .collect(); + return VendorOutcome::Done { + result: synthesized_result(purl, ©_dir, verified, true, None), + entry: None, + warnings: Vec::new(), + }; + } + // Wired but the committed copy is missing/stale: rebuild the + // ARTIFACT only. The lock is already correct and the first run's + // ledger entry holds the only pre-vendor original — running the + // full path here would re-record the live VENDORED fragment as + // `original`, breaking a later `--revert`. 
+ if !dry_run { + if let Err(e) = fresh_copy(installed_dir, ©_dir, None).await { + return VendorOutcome::Done { + result: synthesized_result( + purl, + ©_dir, + Vec::new(), + false, + Some(format!("failed to copy installed package: {e}")), + ), + entry: None, + warnings: Vec::new(), + }; + } + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( + purl, + ©_dir, + record, + sources, + false, + force, + &pkg, + version, + &mut warnings, + ) + .await; + result.package_path = copy_dir.display().to_string(); + if !result.success { + // Don't leave a half-built copy; the pre-state was already + // broken, so removing restores the (missing) status quo. + let _ = remove_tree(&uuid_dir).await; + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored copy for {pkg}@{version} was missing or stale; \ + rebuilt at {copy_rel} (composer.lock untouched)" + ), + )); + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + // Dry runs fall through to the verify-only preview below. 
} // ── dry run: verify-only against the installed dir, no writes ──────── if dry_run { - let mut result = apply_package_patch( + let mut dry_warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, installed_dir, - &record.files, + record, sources, - Some(&record.uuid), true, force, + &pkg, + version, + &mut dry_warnings, ) .await; result.package_path = copy_dir.display().to_string(); return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings: dry_warnings, }; } @@ -225,14 +285,17 @@ pub async fn vendor_composer( warnings: Vec::new(), }; } - let mut result = apply_package_patch( + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, ©_dir, - &record.files, + record, sources, - Some(&record.uuid), false, force, + &pkg, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -242,7 +305,7 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -256,7 +319,7 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; }; let rewritten = rewrite_lock_entry(original_obj, ©_rel, &record.uuid); @@ -272,12 +335,11 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } // ── marker + ledger entry ──────────────────────────────────────────── - let mut warnings = Vec::new(); let base_purl = build_composer_purl(&vendor, &name, version); let mut vulnerabilities: Vec = record.vulnerabilities.keys().cloned().collect(); vulnerabilities.sort(); @@ -1073,6 +1135,51 @@ mod tests { ); } + /// Wired lock + deleted/corrupt copy: the artifact is rebuilt in place, + /// the lock stays byte-identical, no ledger entry is re-recorded. 
+ #[tokio::test] + async fn test_wired_missing_copy_rebuilds_artifact_only() { + let lock = lock_value("psr/log", "3.0.2", false); + let (dir, blobs, installed, record) = fixture(&lock).await; + let root = dir.path(); + + let (r1, e1, _) = + unwrap_done(run_vendor(root, &blobs, &installed, &record, PURL, false).await); + assert!(r1.success); + assert!(e1.is_some()); + let lock_bytes = tokio::fs::read(root.join(COMPOSER_LOCK)).await.unwrap(); + let patched = root.join(copy_rel()).join("src/LoggerInterface.php"); + let patched_bytes = tokio::fs::read(&patched).await.unwrap(); + + // Simulate the fresh-clone hole: the committed copy is gone. + crate::patch::copy_tree::remove_tree(&root.join(copy_rel())) + .await + .unwrap(); + + let (r2, e2, w2) = + unwrap_done(run_vendor(root, &blobs, &installed, &record, PURL, false).await); + assert!(r2.success, "{:?}", r2.error); + assert!( + e2.is_none(), + "artifact-only rebuild must not re-record (the live vendored \ + fragment would clobber the pre-vendor original)" + ); + assert!( + w2.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {w2:?}" + ); + assert_eq!( + tokio::fs::read(&patched).await.unwrap(), + patched_bytes, + "rebuilt copy carries the patched bytes" + ); + assert_eq!( + tokio::fs::read(root.join(COMPOSER_LOCK)).await.unwrap(), + lock_bytes, + "composer.lock untouched by the rebuild" + ); + } + #[tokio::test] async fn test_dry_run_writes_nothing() { let lock = lock_value("psr/log", "3.0.2", false); diff --git a/crates/socket-patch-core/src/patch/vendor/gem.rs b/crates/socket-patch-core/src/patch/vendor/gem.rs index 5ce51a2..8a4ad73 100644 --- a/crates/socket-patch-core/src/patch/vendor/gem.rs +++ b/crates/socket-patch-core/src/patch/vendor/gem.rs @@ -53,8 +53,8 @@ use serde_json::Value; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, - VerifyResult, 
VerifyStatus, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, VerifyResult, + VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::file_hash::compute_file_git_sha256; @@ -244,59 +244,137 @@ pub async fn vendor_gem( // the first run's ledger entry holds the only copy of the pre-vendor // originals. let remote_line = format!(" remote: {copy_rel}"); - let wired = copy_matches_after_hashes(©_dir, &record.files).await + let lock_wired = + lock_text.split('\n').any(|l| l == remote_line) && gemfile_text.contains(©_rel); + let copy_ok = copy_matches_after_hashes(©_dir, &record.files).await && tokio::fs::metadata(copy_dir.join(format!("{name}.gemspec"))) .await - .is_ok() - && lock_text.split('\n').any(|l| l == remote_line) - && gemfile_text.contains(©_rel); - if wired { + .is_ok(); + if lock_wired { if lock_checksum_in_sync(&lock_text, name, version) { - let verified = record - .files - .keys() - .map(|f| already_patched_verify(f)) - .collect(); - return VendorOutcome::Done { - result: synthesized_result(purl, ©_dir, verified, true, None), - entry: None, - warnings: Vec::new(), - }; + if copy_ok { + let verified = record + .files + .keys() + .map(|f| already_patched_verify(f)) + .collect(); + return VendorOutcome::Done { + result: synthesized_result(purl, ©_dir, verified, true, None), + entry: None, + warnings: Vec::new(), + }; + } + // Wired (Gemfile + lock + CHECKSUMS) but the committed copy is + // missing/stale: rebuild the ARTIFACT only — the pair edit is + // already correct and the full path would re-record the live + // vendored fragments as `original`, breaking a later --revert. 
+ if !dry_run { + if let Err(e) = fresh_copy(installed_dir, ©_dir, None).await { + return VendorOutcome::Done { + result: synthesized_result( + purl, + ©_dir, + Vec::new(), + false, + Some(format!("failed to copy installed gem: {e}")), + ), + entry: None, + warnings: Vec::new(), + }; + } + if let Err(e) = + tokio::fs::write(copy_dir.join(format!("{name}.gemspec")), &spec_text).await + { + let _ = remove_tree(&uuid_dir).await; + return VendorOutcome::Done { + result: synthesized_result( + purl, + ©_dir, + Vec::new(), + false, + Some(format!( + "failed to copy the stub gemspec into the vendored dir: {e}" + )), + ), + entry: None, + warnings: Vec::new(), + }; + } + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( + purl, + ©_dir, + record, + sources, + false, + force, + name, + version, + &mut warnings, + ) + .await; + result.package_path = copy_dir.display().to_string(); + if !result.success { + let _ = remove_tree(&uuid_dir).await; + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored copy for {name}@{version} was missing or \ + stale; rebuilt at {copy_rel} (Gemfile and Gemfile.lock untouched)" + ), + )); + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + // Dry runs fall through to the verify-only preview below. + } else { + // Wired everywhere EXCEPT the lock's CHECKSUMS entry, which still + // carries the registry form — a lock wired by a pre-CHECKSUMS-aware + // socket-patch. Bundler never repairs this itself (spike G4: install, + // frozen install and `bundle lock` all silently preserve a stale + // token), and we cannot strip it here: this run records no ledger + // entry, so a revert would put back everything EXCEPT the token — + // leaving a bare CHECKSUMS entry on a registry-sourced gem, which + // hard-fails frozen installs (exit 16). 
Refuse with the repair path + // instead of the generic "already carries `path:`" Gemfile refusal. + return refused( + "vendor_stale_lock_checksum", + format!( + "Gemfile.lock already wires `{name}` to {copy_rel} but its CHECKSUMS entry is not bundler's bare path-gem form (an earlier socket-patch left the registry line in place); run `vendor --revert` for {purl} and re-vendor to repair it" + ), + ); } - // Wired everywhere EXCEPT the lock's CHECKSUMS entry, which still - // carries the registry form — a lock wired by a pre-CHECKSUMS-aware - // socket-patch. Bundler never repairs this itself (spike G4: install, - // frozen install and `bundle lock` all silently preserve a stale - // token), and we cannot strip it here: this run records no ledger - // entry, so a revert would put back everything EXCEPT the token — - // leaving a bare CHECKSUMS entry on a registry-sourced gem, which - // hard-fails frozen installs (exit 16). Refuse with the repair path - // instead of the generic "already carries `path:`" Gemfile refusal. 
- return refused( - "vendor_stale_lock_checksum", - format!( - "Gemfile.lock already wires `{name}` to {copy_rel} but its CHECKSUMS entry is not bundler's bare path-gem form (an earlier socket-patch left the registry line in place); run `vendor --revert` for {purl} and re-vendor to repair it" - ), - ); } // ── dry run: verify-only against the installed dir, no writes ──────── if dry_run { - let mut result = apply_package_patch( + let mut dry_warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, installed_dir, - &record.files, + record, sources, - Some(&record.uuid), true, force, + name, + version, + &mut dry_warnings, ) .await; result.package_path = copy_dir.display().to_string(); return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings: dry_warnings, }; } @@ -338,14 +416,17 @@ pub async fn vendor_gem( warnings: Vec::new(), }; } - let mut result = apply_package_patch( + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, ©_dir, - &record.files, + record, sources, - Some(&record.uuid), false, force, + name, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -355,7 +436,7 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -368,7 +449,7 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -395,13 +476,12 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } }; // ── marker + ledger entry ──────────────────────────────────────────── - let mut warnings = Vec::new(); let base_purl = build_gem_purl(name, version); let mut vulnerabilities: Vec = record.vulnerabilities.keys().cloned().collect(); vulnerabilities.sort(); @@ -1788,6 +1868,45 @@ mod tests { ); } + /// Wired Gemfile+lock with a deleted committed copy: the artifact (and + /// its 
stub gemspec) is rebuilt, the pair stays byte-identical, no entry. + #[tokio::test] + async fn test_wired_missing_copy_rebuilds_artifact_only() { + let (_tmp, root, installed, blobs, record) = fixture(GEMFILE_DIRECT, LOCK_DIRECT).await; + + let (r1, e1, _) = unwrap_done(run_vendor(&root, &blobs, &installed, &record, false).await); + assert!(r1.success); + assert!(e1.is_some()); + let gemfile1 = tokio::fs::read(root.join(GEMFILE)).await.unwrap(); + let lock1 = tokio::fs::read(root.join(GEMFILE_LOCK)).await.unwrap(); + let copy_root = root.join(format!(".socket/vendor/gem/{UUID}/rack-3.2.6")); + assert!(copy_root.exists()); + + crate::patch::copy_tree::remove_tree(©_root) + .await + .unwrap(); + + let (r2, e2, w2) = unwrap_done(run_vendor(&root, &blobs, &installed, &record, false).await); + assert!(r2.success, "{:?}", r2.error); + assert!( + e2.is_none(), + "artifact-only rebuild must not re-record the ledger entry" + ); + assert!( + w2.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {w2:?}" + ); + assert!( + copy_root.join("rack.gemspec").exists(), + "stub gemspec regenerated with the rebuilt copy" + ); + assert_eq!(tokio::fs::read(root.join(GEMFILE)).await.unwrap(), gemfile1); + assert_eq!( + tokio::fs::read(root.join(GEMFILE_LOCK)).await.unwrap(), + lock1 + ); + } + #[tokio::test] async fn test_dry_run_writes_nothing() { let (_tmp, root, installed, blobs, record) = fixture(GEMFILE_DIRECT, LOCK_DIRECT).await; diff --git a/crates/socket-patch-core/src/patch/vendor/golang.rs b/crates/socket-patch-core/src/patch/vendor/golang.rs index 8961482..8134608 100644 --- a/crates/socket-patch-core/src/patch/vendor/golang.rs +++ b/crates/socket-patch-core/src/patch/vendor/golang.rs @@ -33,6 +33,24 @@ use super::state::{ }; use super::{RevertOutcome, VendorOutcome, VendorWarning}; +/// The committed copy exists and every patched file matches its afterHash. 
+async fn copy_hashes_ok( + copy_dir: &Path, + files: &std::collections::HashMap, +) -> bool { + if tokio::fs::metadata(copy_dir).await.is_err() { + return false; + } + for (file_name, info) in files { + let path = copy_dir.join(crate::patch::apply::normalize_file_path(file_name)); + match crate::patch::file_hash::compute_file_git_sha256(&path).await { + Ok(h) if h == info.after_hash => {} + _ => return false, + } + } + true +} + /// Vendor one Go module: patched copy in the uuid dir + a vendor-owned /// `replace` directive + marker, returning the ledger entry to persist. /// @@ -101,6 +119,41 @@ pub async fn vendor_go_module( .is_some_and(|e| e.owner == Some(ReplaceOwner::GoPatches)); let prior_path = prior.as_ref().and_then(|e| e.path.clone()); + // Re-run shape detection: the replace already points at THIS uuid's copy. + // The engine rebuilds a missing/stale copy and its replace upsert is a + // byte-stable no-op, so a wired re-run must return `entry: None` — the + // first run's ledger entry holds the only pre-vendor original, and the + // `prior_path` recorded here would be our own vendored pointer. + let wired = + prior_path.as_deref() == Some(replace_target_path(&base_rel, module, version).as_str()); + let copy_dir = project_root + .join(&base_rel) + .join(format!("{module}@{version}")); + let copy_was_ok = wired && copy_hashes_ok(©_dir, &record.files).await; + + // Vendor auto-force policy (the engine's copy is staged from the + // pristine source, never the user's tree — see `force_apply_staged`): + // missing patch targets still fail closed unless the caller's own + // `--force` asked for the skip tolerance, then the engine apply runs + // forced so a beforeHash mismatch (already-applied module, or a patch + // built against different bytes) overwrites with the verified patched + // content. The engine is shared with the in-place `apply` redirect + // path, whose strict semantics stay unchanged. 
+ let mut warnings: Vec = Vec::new(); + if !force { + let missing = super::missing_existing_patch_files(pristine_src, &record.files).await; + if let Some(first) = missing.first() { + return VendorOutcome::Done { + result: super::failed_apply_result( + purl, + format!("Cannot apply patch: {first} - File not found"), + ), + entry: None, + warnings, + }; + } + } + // The engine does the heavy lifting: fresh copy → hardened apply pipeline // → `replace` upsert (which refuses a user-authored same-version pin). let result = apply_go_redirect( @@ -114,15 +167,18 @@ pub async fn vendor_go_module( sources, Some(&record.uuid), dry_run, - force, + crate::patch::apply::MismatchPolicy::Force, ) .await; + if result.success { + warnings.extend(super::mismatch_overwrite_warnings(&result, module, version)); + } if dry_run { return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } if !result.success { @@ -134,7 +190,7 @@ pub async fn vendor_go_module( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } // A patch with no files is a no-op success: the engine wrote no copy and @@ -143,11 +199,48 @@ pub async fn vendor_go_module( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } - let mut warnings = Vec::new(); + if wired { + // Already wired to this uuid: either the engine's in-sync hot path + // (copy intact) or an artifact-only rebuild (copy was missing/stale). + // Never re-record the ledger entry. + if !copy_was_ok { + // A wholesale-deleted uuid dir lost the informational marker; + // restore it alongside the rebuilt copy (never a trust input — + // a failed write only warns). 
+ let mut vulnerabilities: Vec = record.vulnerabilities.keys().cloned().collect(); + vulnerabilities.sort(); + let marker = VendorMarker { + schema_version: 1, + purl: strip_purl_qualifiers(purl).to_string(), + patch_uuid: record.uuid.clone(), + ecosystem: "golang".to_string(), + vulnerabilities, + vendored_at: vendored_at.to_string(), + }; + if let Err(e) = write_marker(&project_root.join(&base_rel), &marker).await { + warnings.push(VendorWarning::new( + "marker_write_failed", + format!("could not write the vendor marker: {e}"), + )); + } + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored copy for {module}@{version} was missing or \ + stale; rebuilt under {base_rel} (go.mod untouched)" + ), + )); + } + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } if takeover { // The `replace` line was already atomically repointed by the upsert; @@ -314,6 +407,7 @@ mod tests { use crate::hash::git_sha256::compute_git_sha256_from_bytes; use crate::manifest::schema::{PatchFileInfo, VulnerabilityInfo}; use crate::patch::apply::ApplyResult; + use crate::patch::apply::MismatchPolicy; use crate::patch::vendor::state::VENDOR_MARKER_FILE; use std::collections::HashMap; use std::path::PathBuf; @@ -532,7 +626,7 @@ mod tests { &sources, Some(UUID), false, - false, + MismatchPolicy::Warn, ) .await; assert!(pre.success, "fixture redirect failed: {:?}", pre.error); @@ -571,6 +665,47 @@ mod tests { ); } + /// Wired go.mod with a deleted committed copy: the module copy is + /// rebuilt, go.mod stays byte-identical, no fresh ledger entry. 
+ #[tokio::test] + async fn test_wired_missing_copy_rebuilds_artifact_only() { + let (dir, blobs, pristine, record) = fixture().await; + let root = dir.path(); + expect_done(run_vendor(PURL, root, &blobs, &pristine, &record, false).await); + + let copy = root.join(copy_rel()).join("bar.go"); + let gomod = root.join("go.mod"); + let copy1 = tokio::fs::read(©).await.unwrap(); + let mod1 = tokio::fs::read(&gomod).await.unwrap(); + + crate::patch::copy_tree::remove_tree(&root.join(copy_rel())) + .await + .unwrap(); + + let (result, entry, warnings) = + expect_done(run_vendor(PURL, root, &blobs, &pristine, &record, false).await); + assert!(result.success, "{:?}", result.error); + assert!( + entry.is_none(), + "artifact-only rebuild must not re-record (prior_path is our own \ + vendored pointer here, not a pre-vendor original)" + ); + assert!( + warnings.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {warnings:?}" + ); + assert_eq!( + tokio::fs::read(©).await.unwrap(), + copy1, + "rebuilt copy carries the patched bytes" + ); + assert_eq!( + tokio::fs::read(&gomod).await.unwrap(), + mod1, + "go.mod byte-stable across the rebuild" + ); + } + #[tokio::test] async fn test_idempotent_rerun_is_byte_stable() { let (dir, blobs, pristine, record) = fixture().await; @@ -589,8 +724,11 @@ mod tests { result.files_patched.is_empty(), "in-sync re-run patches nothing" ); - assert!(entry.is_some(), "re-run still reports the ledger entry"); - assert!(!entry.unwrap().took_over_go_patches); + assert!( + entry.is_none(), + "an in-sync re-run records no entry — the first run's ledger \ + entry holds the only pre-vendor original" + ); assert!(warnings.is_empty(), "{warnings:?}"); assert_eq!( tokio::fs::read(©).await.unwrap(), @@ -696,7 +834,7 @@ mod tests { &sources, Some(UUID), false, - false, + MismatchPolicy::Warn, ) .await; let (_result, entry, _warnings) = diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs 
b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs new file mode 100644 index 0000000..1fad375 --- /dev/null +++ b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -0,0 +1,2308 @@ +//! Read-only lockfile inventories: the dependency set a project's lockfile +//! resolves, independent of what is installed on disk. +//! +//! Two consumers: +//! +//! * `scan` supplements its installed-tree crawl with lockfile-only entries +//! (discovery on fresh clones and partial installs), warning that those +//! packages are not yet installed; +//! * `vendor` fetches the pristine artifact for a lockfile-resolved package +//! with no installed copy ([`super::registry_fetch`]), verifying the bytes +//! against the integrity the lock records — FAIL-CLOSED: an entry whose +//! lock carries no content verifier is never fetched. +//! +//! Parsing is fail-soft per entry (a malformed entry is skipped, never an +//! error; a malformed file yields `None`) and fail-closed per value: +//! names/versions are path-safety-guarded before an entry is emitted — the +//! lockfile is committed, tamperable input that later feeds filesystem paths +//! and download URLs. + +use std::collections::HashMap; +use std::path::Path; + +use serde_json::Value; + +use crate::patch::path_safety; +use crate::utils::purl::strip_purl_qualifiers; + +use super::npm_common::is_safe_npm_name; +use super::npm_flavor::{detect_npm_lock_flavor, NpmLockFlavor}; +use super::path::parse_vendor_path; +use super::{bun_lock, pnpm_lock, yarn_berry_lock, yarn_classic_lock}; + +/// The content verifier a lockfile records for an entry. The fetch layer +/// refuses entries whose verifier is [`LockIntegrity::None`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LockIntegrity { + /// SRI string (`sha512-`, possibly multi-hash space-separated) — + /// npm family; verified against the raw tarball bytes. 
+ Sri(String), + /// yarn classic `resolved "...#"` fragment (40-hex) — verified + /// against the raw tarball bytes. + Sha1Hex(String), + /// yarn berry cache-zip checksum (`/`, e.g. `10c0/…`) — + /// verified by rebuilding the deterministic cache zip from the fetched + /// tarball and comparing (the lock never hashes the tarball itself). + BerryChecksum(String), + /// Hex sha256 of the artifact (Cargo.lock `checksum`, pypi file hashes, + /// Gemfile.lock `CHECKSUMS`). + Sha256Hex(String), + /// go.sum module-zip dirhash (`h1:`). + GoH1(String), + /// The lock records no content verifier. + None, +} + +/// One lockfile-resolved package. +#[derive(Debug, Clone)] +pub struct LockfileEntry { + /// Vendor-ecosystem tag (`npm`, `cargo`, `golang`, `pypi`, `gem`, + /// `composer`) — matches `VendorEntry::ecosystem`. + pub ecosystem: &'static str, + /// Literal (percent-decoded) package name, e.g. `@scope/name`. + pub name: String, + /// Exact resolved version. + pub version: String, + /// Canonical literal purl (`pkg:npm/@scope/name@1.0.0`) — the same form + /// the crawlers emit. + pub purl: String, + /// Artifact URL when the lock records one (package-lock `resolved`, + /// yarn `resolved` minus its `#sha1` fragment, pnpm `tarball:`); `None` + /// means the fetcher constructs the conventional registry URL. + pub resolved: Option, + pub integrity: LockIntegrity, +} + +impl LockfileEntry { + fn npm( + name: impl Into, + version: impl Into, + resolved: Option, + integrity: LockIntegrity, + ) -> Self { + let (name, version) = (name.into(), version.into()); + let purl = format!("pkg:npm/{name}@{version}"); + LockfileEntry { + ecosystem: "npm", + name, + version, + purl, + resolved, + integrity, + } + } +} + +/// Inventory the project's npm-family lockfile. Routes by +/// [`detect_npm_lock_flavor`] (PnP markers, bun.lockb, unsupported lock +/// versions, and a missing lockfile all yield `None`). 
+pub async fn inventory_npm_lock( + project_root: &Path, +) -> Option<(NpmLockFlavor, Vec)> { + let (flavor, _warnings) = detect_npm_lock_flavor(project_root).await.ok()?; + let raw = match flavor { + NpmLockFlavor::PackageLock => inventory_package_lock(project_root).await, + NpmLockFlavor::Pnpm => inventory_pnpm_lock(project_root).await, + NpmLockFlavor::YarnClassic => inventory_yarn_classic(project_root).await, + NpmLockFlavor::YarnBerry => inventory_yarn_berry(project_root).await, + NpmLockFlavor::Bun => inventory_bun(project_root).await, + }?; + Some((flavor, finalize_npm(raw))) +} + +/// Match a manifest/API purl (possibly percent-encoded, possibly carrying +/// qualifiers) against the inventory: components decode via +/// [`crate::utils::purl::normalize_purl`], so `pkg:npm/%40scope/x@1` +/// matches the literal entry. +pub fn lookup<'a>(entries: &'a [LockfileEntry], purl: &str) -> Option<&'a LockfileEntry> { + let decoded = crate::utils::purl::normalize_purl(strip_purl_qualifiers(purl)).into_owned(); + let rest = decoded.strip_prefix("pkg:")?; + let (purl_type, rest) = rest.split_once('/')?; + // purl type → vendor-ecosystem tag (same mapping the dispatcher uses). + let eco = match purl_type { + "npm" => "npm", + "cargo" => "cargo", + "golang" => "golang", + "pypi" => "pypi", + "gem" => "gem", + "composer" => "composer", + _ => return None, + }; + let at = rest.rfind('@').filter(|&i| i > 0)?; + let (name, version) = (&rest[..at], &rest[at + 1..]); + // pypi names compare in PEP 503 normalized form. + let name = if eco == "pypi" { + pep503(name) + } else { + name.to_string() + }; + entries + .iter() + .find(|e| e.ecosystem == eco && e.name == name && e.version == version) +} + +/// Everything every recognized lockfile in the project resolves — the +/// union the scan supplement and the vendor auto-fetch consume. 
+pub async fn inventory_project(project_root: &Path) -> Vec { + let mut out: Vec = Vec::new(); + if let Some((_, entries)) = inventory_npm_lock(project_root).await { + out.extend(entries); + } + #[cfg(feature = "cargo")] + if let Some(entries) = inventory_cargo_lock(project_root).await { + out.extend(entries); + } + #[cfg(feature = "golang")] + if let Some(entries) = inventory_go_sum(project_root).await { + out.extend(entries); + } + #[cfg(feature = "composer")] + if let Some(entries) = inventory_composer_lock(project_root).await { + out.extend(entries); + } + if let Some(entries) = inventory_gemfile_lock(project_root).await { + out.extend(entries); + } + if let Some(entries) = inventory_pypi_locks(project_root).await { + out.extend(entries); + } + out +} + +/// Guard + dedup the raw npm entries: unsafe names/versions are dropped +/// fail-closed; duplicate (name, version) instances collapse to one, +/// preferring the instance that carries a verifier. +fn finalize_npm(raw: Vec) -> Vec { + dedup_prefer_integrity( + raw.into_iter() + .filter(|e| { + is_safe_npm_name(&e.name) && path_safety::is_safe_single_segment(&e.version) + }) + .collect(), + ) +} + +/// Collapse duplicate (name, version) instances, preferring one that +/// carries a verifier. +fn dedup_prefer_integrity(raw: Vec) -> Vec { + let mut seen: HashMap<(String, String), usize> = HashMap::new(); + let mut out: Vec = Vec::new(); + for entry in raw { + let key = (entry.name.clone(), entry.version.clone()); + match seen.get(&key) { + Some(&i) => { + if out[i].integrity == LockIntegrity::None && entry.integrity != LockIntegrity::None + { + out[i] = entry; + } + } + None => { + seen.insert(key, out.len()); + out.push(entry); + } + } + } + out +} + +// ──────────────────────────────── Cargo.lock ──────────────────────────────── + +/// Inventory `Cargo.lock` `[[package]]` blocks. 
Only crates.io-sourced +/// entries are fetchable (their `checksum` is the sha256 of the `.crate` +/// file); workspace members (no `source`) are skipped, and git/custom- +/// registry sources stay listed for discovery without a verifier. +#[cfg(feature = "cargo")] +pub async fn inventory_cargo_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("Cargo.lock")) + .await + .ok()?; + /// One in-flight `[[package]]` block: name, version, source, checksum. + type CargoBlock = ( + Option, + Option, + Option, + Option, + ); + let mut out = Vec::new(); + let mut cur: Option = None; + let flush = |cur: &mut Option, out: &mut Vec| { + if let Some((Some(name), Some(version), source, checksum)) = cur.take() { + let Some(source) = source else { + return; // workspace member + }; + if !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + { + return; + } + let crates_io = source.contains("github.com/rust-lang/crates.io-index") + || source.contains("index.crates.io"); + let integrity = match checksum { + Some(c) + if crates_io && c.len() == 64 && c.bytes().all(|b| b.is_ascii_hexdigit()) => + { + LockIntegrity::Sha256Hex(c) + } + _ => LockIntegrity::None, + }; + let purl = format!("pkg:cargo/{name}@{version}"); + out.push(LockfileEntry { + ecosystem: "cargo", + name, + version, + purl, + resolved: None, + integrity, + }); + } + }; + for line in text.lines() { + let line = line.trim(); + if line == "[[package]]" { + flush(&mut cur, &mut out); + cur = Some((None, None, None, None)); + continue; + } + if line.starts_with('[') { + flush(&mut cur, &mut out); + continue; + } + let Some(slot) = cur.as_mut() else { continue }; + let Some((key, value)) = line.split_once('=') else { + continue; + }; + let value = value.trim().trim_matches('"').to_string(); + match key.trim() { + "name" => slot.0 = Some(value), + "version" => slot.1 = Some(value), + "source" => slot.2 = Some(value), + "checksum" => slot.3 
= Some(value), + _ => {} + } + } + flush(&mut cur, &mut out); + Some(dedup_prefer_integrity(out)) +} + +// ────────────────────────────────── go.sum ────────────────────────────────── + +/// Inventory `go.sum` module-zip lines (` h1:`); the +/// `/go.mod`-suffixed lines hash only the manifest and are skipped. go.sum +/// may list more modules than the final build graph — acceptable for +/// discovery, and the manifest decides what actually gets vendored. +#[cfg(feature = "golang")] +pub async fn inventory_go_sum(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("go.sum")) + .await + .ok()?; + let mut out = Vec::new(); + for line in text.lines() { + let mut parts = line.split_whitespace(); + let (Some(module), Some(version), Some(hash)) = (parts.next(), parts.next(), parts.next()) + else { + continue; + }; + if version.ends_with("/go.mod") || !hash.starts_with("h1:") { + continue; + } + // SECURITY: module path segments and the version feed paths/URLs. + if !path_safety::is_safe_multi_segment(module) + || !path_safety::is_safe_single_segment(version) + { + continue; + } + out.push(LockfileEntry { + ecosystem: "golang", + name: module.to_string(), + version: version.to_string(), + purl: format!("pkg:golang/{module}@{version}"), + resolved: None, + integrity: LockIntegrity::GoH1(hash.to_string()), + }); + } + Some(dedup_prefer_integrity(out)) +} + +/// Keep a lock-recorded URL only when it is a plain http(s) artifact URL +/// (drops `git+…`, `file:…`, `link:…` — content the registry conventions +/// cannot reproduce; such entries stay listed for discovery but the fetch +/// layer's integrity rule decides fetchability). 
+fn http_url(raw: &str) -> Option { + (raw.starts_with("https://") || raw.starts_with("http://")).then(|| raw.to_string()) +} + +// ──────────────────── package-lock.json / npm-shrinkwrap ──────────────────── + +async fn inventory_package_lock(root: &Path) -> Option> { + // Shrinkwrap wins, mirroring `npm_lock::select_lockfile`. + let mut bytes = None; + for lock in ["npm-shrinkwrap.json", "package-lock.json"] { + if let Ok(b) = tokio::fs::read(root.join(lock)).await { + bytes = Some(b); + break; + } + } + let doc: Value = serde_json::from_slice(&bytes?).ok()?; + // v1 legacy locks have no `packages` map — no inventory (documented). + let packages = doc.get("packages")?.as_object()?; + + let mut out = Vec::new(); + for (key, node) in packages { + // "" is the root project; keys without node_modules/ are workspace + // members (mirrors npm_lock::scan_lock_matches' member rule). + let Some((_, key_name)) = key.rsplit_once("node_modules/") else { + continue; + }; + if node.get("link").and_then(Value::as_bool).unwrap_or(false) + || node + .get("inBundle") + .and_then(Value::as_bool) + .unwrap_or(false) + { + continue; + } + let name = node + .get("name") + .and_then(Value::as_str) + .unwrap_or(key_name) + .to_string(); + let Some(version) = node.get("version").and_then(Value::as_str) else { + continue; + }; + let resolved_raw = node.get("resolved").and_then(Value::as_str); + // Our own vendored spec: not a registry dependency. 
+ if resolved_raw.is_some_and(|r| parse_vendor_path(r).is_some()) { + continue; + } + let integrity = node + .get("integrity") + .and_then(Value::as_str) + .map(|i| LockIntegrity::Sri(i.to_string())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm( + name, + version, + resolved_raw.and_then(http_url), + integrity, + )); + } + Some(out) +} + +// ─────────────────────────── pnpm-lock.yaml v9 ─────────────────────────── + +/// Extract one value from an inline YAML map fragment like +/// `{integrity: sha512-…, tarball: file:…}` (values optionally quoted). +fn inline_map_value(fragment: &str, field: &str) -> Option { + let at = fragment.find(&format!("{field}:"))?; + let rest = fragment[at + field.len() + 1..].trim_start(); + let end = rest.find([',', '}']).unwrap_or(rest.len()); + let value = rest[..end].trim().trim_matches(['\'', '"']); + (!value.is_empty()).then(|| value.to_string()) +} + +async fn inventory_pnpm_lock(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("pnpm-lock.yaml")) + .await + .ok()?; + let lines = pnpm_lock::split_lines(&text); + let (start, end) = pnpm_lock::section_bounds(&lines, "packages")?; + + let mut out = Vec::new(); + let mut i = start + 1; + while let Some(block) = pnpm_lock::next_block(&lines, i, end) { + i = block.end; + // Key grammar: `name@version` (name may be `@scope/name`), with + // optional peer-dep suffixes `(peer@1.2.3)…` after the version. + let base = match block.key.find('(') { + Some(p) => block.key[..p].trim_end(), + None => block.key.as_str(), + }; + let Some(at) = base.rfind('@').filter(|&p| p > 0) else { + continue; + }; + let (name, version) = (&base[..at], &base[at + 1..]); + // Only plain registry versions: `file:`/`link:`/`https:`/git specs + // are not registry-resolvable. 
+ if !version.chars().next().is_some_and(|c| c.is_ascii_digit()) { + continue; + } + let mut integrity = LockIntegrity::None; + let mut tarball: Option = None; + for line in &lines[block.header + 1..block.end] { + let t = line.trim(); + if let Some(rest) = t.strip_prefix("resolution:") { + if let Some(v) = inline_map_value(rest, "integrity") { + integrity = LockIntegrity::Sri(v); + } + tarball = inline_map_value(rest, "tarball"); + break; + } + } + // Our own vendored spec: not a registry dependency. + if tarball + .as_deref() + .is_some_and(|t| parse_vendor_path(t).is_some()) + { + continue; + } + out.push(LockfileEntry::npm( + name, + version, + tarball.as_deref().and_then(http_url), + integrity, + )); + } + Some(out) +} + +// ───────────────────────────── yarn.lock (classic) ───────────────────────────── + +async fn inventory_yarn_classic(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("yarn.lock")) + .await + .ok()?; + let mut out = Vec::new(); + for block in yarn_classic_lock::scan_blocks(&text) { + // Our own vendored block: not a registry dependency. + if yarn_classic_lock::block_points_into_vendor(&block.lines) { + continue; + } + let patterns = yarn_classic_lock::split_key_patterns(&block.key); + let Some(name) = patterns + .first() + .and_then(|p| yarn_classic_lock::pattern_real_name(p)) + else { + continue; + }; + let Some(version) = yarn_classic_lock::classic_field(&block.lines, "version") else { + continue; + }; + let resolved_raw = yarn_classic_lock::classic_field(&block.lines, "resolved"); + // `resolved "url#sha1hex"` — the fragment is the legacy verifier. 
+ let (resolved, sha1_hex) = match resolved_raw { + Some(raw) => match raw.split_once('#') { + Some((url, frag)) => ( + http_url(url), + (frag.len() == 40 && frag.bytes().all(|b| b.is_ascii_hexdigit())) + .then(|| frag.to_ascii_lowercase()), + ), + None => (http_url(raw), None), + }, + None => (None, None), + }; + let integrity = yarn_classic_lock::classic_field(&block.lines, "integrity") + .map(|i| LockIntegrity::Sri(i.to_string())) + .or(sha1_hex.map(LockIntegrity::Sha1Hex)) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm(name, version, resolved, integrity)); + } + Some(out) +} + +// ───────────────────────────── yarn.lock (berry) ───────────────────────────── + +async fn inventory_yarn_berry(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("yarn.lock")) + .await + .ok()?; + let mut out = Vec::new(); + // Berry reuses classic's block grammar (same scanner the berry backend + // imports); `__metadata` and workspace/patch/file resolutions are not + // registry packages. + for block in yarn_classic_lock::scan_blocks(&text) { + if block.key.starts_with("__metadata") { + continue; + } + let Some(resolution) = yarn_berry_lock::berry_field(&block.lines, "resolution") else { + continue; + }; + // Registry resolutions are `name@npm:` (a `::binding` + // suffix may follow). Anything else (workspace:/patch:/file:/link:) + // is skipped — including our own vendored file: resolutions. 
+ let Some((name, reference)) = yarn_classic_lock::split_pattern(resolution) else { + continue; + }; + let Some(reference) = reference.strip_prefix("npm:") else { + continue; + }; + let version_from_res = reference.split("::").next().unwrap_or(reference); + let version = + yarn_berry_lock::berry_field(&block.lines, "version").unwrap_or(version_from_res); + let integrity = yarn_berry_lock::berry_field(&block.lines, "checksum") + .map(|c| LockIntegrity::BerryChecksum(c.to_string())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm(name, version, None, integrity)); + } + Some(out) +} + +// ──────────────────────────────── bun.lock ──────────────────────────────── + +async fn inventory_bun(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("bun.lock")) + .await + .ok()?; + bun_lock::check_lock_version(&text).ok()?; + let lines: Vec = text.split('\n').map(str::to_string).collect(); + let entries = bun_lock::parse_packages_section(&lines).ok()?; + + let mut out = Vec::new(); + for entry in entries { + // Registry entries are 4-tuples `[spec, registry, {deps}, sha512]`; + // our vendored 3-tuples and other shapes are skipped. + if entry.elems.len() != 4 || !entry.elems[2].starts_with('{') { + continue; + } + let Some(spec) = entry + .elems + .first() + .and_then(|e| bun_lock::decode_json_string(e)) + else { + continue; + }; + let Some((name, version)) = bun_lock::split_name_spec(&spec) else { + continue; + }; + if !version.chars().next().is_some_and(|c| c.is_ascii_digit()) { + continue; + } + let Some(registry) = bun_lock::decode_json_string(&entry.elems[1]) else { + continue; + }; + let Some(integrity) = bun_lock::decode_json_string(&entry.elems[3]) else { + continue; + }; + // elem[1] is `""` for the default registry; a full `.tgz` URL is + // used verbatim; any other base falls back to conventional URL + // construction (the integrity check still gates the content). 
+ let resolved = (registry.ends_with(".tgz")) + .then(|| http_url(®istry)) + .flatten(); + out.push(LockfileEntry::npm( + name, + version, + resolved, + LockIntegrity::Sri(integrity), + )); + } + Some(out) +} + +// ────────────────────────────── composer.lock ────────────────────────────── + +/// Inventory `composer.lock` `packages`/`packages-dev`. The `dist.shasum` +/// (sha1 of the dist zip) is frequently empty — such entries stay +/// discovery-only. Names lowercase to the canonical packagist form; +/// versions drop the pretty leading `v`. +#[cfg(feature = "composer")] +pub async fn inventory_composer_lock(project_root: &Path) -> Option> { + let bytes = tokio::fs::read(project_root.join("composer.lock")) + .await + .ok()?; + let doc: Value = serde_json::from_slice(&bytes).ok()?; + let mut out = Vec::new(); + for section in ["packages", "packages-dev"] { + let Some(list) = doc.get(section).and_then(Value::as_array) else { + continue; + }; + for pkg in list { + let Some(name) = pkg.get("name").and_then(Value::as_str) else { + continue; + }; + let Some(version) = pkg.get("version").and_then(Value::as_str) else { + continue; + }; + let name = name.to_ascii_lowercase(); + let version = version + .strip_prefix('v') + .filter(|r| r.chars().next().is_some_and(|c| c.is_ascii_digit())) + .unwrap_or(version) + .to_string(); + if !path_safety::is_safe_multi_segment(&name) + || name.split('/').count() != 2 + || !path_safety::is_safe_single_segment(&version) + { + continue; + } + let dist = pkg.get("dist"); + let dist_url = dist + .and_then(|d| d.get("url")) + .and_then(Value::as_str) + .unwrap_or(""); + // Our own vendored entries use a path dist — skip. 
+ if dist + .and_then(|d| d.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "path") + || parse_vendor_path(dist_url).is_some() + { + continue; + } + let is_zip = dist + .and_then(|d| d.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "zip"); + let shasum = dist + .and_then(|d| d.get("shasum")) + .and_then(Value::as_str) + .unwrap_or(""); + let integrity = + if is_zip && shasum.len() == 40 && shasum.bytes().all(|b| b.is_ascii_hexdigit()) { + LockIntegrity::Sha1Hex(shasum.to_ascii_lowercase()) + } else { + LockIntegrity::None + }; + let purl = format!("pkg:composer/{name}@{version}"); + out.push(LockfileEntry { + ecosystem: "composer", + name, + version, + purl, + resolved: is_zip.then(|| http_url(dist_url)).flatten(), + integrity, + }); + } + } + Some(dedup_prefer_integrity(out)) +} + +// ────────────────────────────── Gemfile.lock ────────────────────────────── + +/// Inventory `Gemfile.lock`: `GEM`-section `specs:` entries (4-space +/// indent; deeper lines are dependency ranges) plus the bundler ≥ 2.6 +/// `CHECKSUMS` section's sha256 values when present (older locks stay +/// discovery-only). Platform-suffixed specs (`nokogiri (1.16.5-arm64-…)`) +/// are skipped — platform gems are unsupported for vendoring anyway. 
+pub async fn inventory_gemfile_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("Gemfile.lock")) + .await + .ok()?; + let mut remote: Option = None; + let mut checksums: HashMap<(String, String), String> = HashMap::new(); + let mut specs: Vec<(String, String)> = Vec::new(); + + let mut section = ""; + let mut in_specs = false; + for line in text.lines() { + if !line.starts_with(' ') { + section = line.trim(); + in_specs = false; + continue; + } + let trimmed = line.trim_start(); + let indent = line.len() - trimmed.len(); + match section { + "GEM" => { + if indent == 2 { + if let Some(r) = trimmed.strip_prefix("remote:") { + let r = r.trim().trim_end_matches('/'); + if remote.is_none() && !r.is_empty() { + remote = Some(r.to_string()); + } + } + in_specs = trimmed == "specs:"; + } else if in_specs && indent == 4 { + if let Some((name, version)) = parse_gem_spec_line(trimmed) { + specs.push((name, version)); + } + } + } + "CHECKSUMS" => { + // ` name (version) sha256=hex` + if let Some((spec_part, hash_part)) = + trimmed.rsplit_once(" sha256=").map(|(s, h)| (s, h.trim())) + { + if let Some((name, version)) = parse_gem_spec_line(spec_part) { + if hash_part.len() == 64 && hash_part.bytes().all(|b| b.is_ascii_hexdigit()) + { + checksums.insert((name, version), hash_part.to_ascii_lowercase()); + } + } + } + } + _ => {} + } + } + if specs.is_empty() { + return None; + } + let base = remote.unwrap_or_else(|| "https://rubygems.org".to_string()); + let mut out = Vec::new(); + for (name, version) in specs { + if !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + { + continue; + } + let integrity = checksums + .get(&(name.clone(), version.clone())) + .map(|h| LockIntegrity::Sha256Hex(h.clone())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry { + ecosystem: "gem", + purl: format!("pkg:gem/{name}@{version}"), + resolved: 
http_url(&format!("{base}/downloads/{name}-{version}.gem")), + name, + version, + integrity, + }); + } + Some(dedup_prefer_integrity(out)) +} + +/// `name (version)` → parts; platform-suffixed versions (`1.2.3-x86_64…`) +/// and dependency lines (no parens / range operators) yield `None`. +fn parse_gem_spec_line(line: &str) -> Option<(String, String)> { + let (name, rest) = line.split_once(" (")?; + let version = rest.strip_suffix(')')?; + if name.is_empty() + || version.is_empty() + || version.contains(' ') + || version.contains('-') + || !version.chars().next().is_some_and(|c| c.is_ascii_digit()) + { + return None; + } + Some((name.to_string(), version.to_string())) +} + +// ─────────────────────────────── pypi locks ─────────────────────────────── + +/// PEP 503 name normalization (`Foo._Bar` → `foo-bar`) — pypi purls and +/// lock entries must compare in this form. +fn pep503(name: &str) -> String { + let mut out = String::with_capacity(name.len()); + let mut last_dash = false; + for c in name.chars() { + let c = c.to_ascii_lowercase(); + if c == '-' || c == '_' || c == '.' { + if !last_dash { + out.push('-'); + last_dash = true; + } + } else { + out.push(c); + last_dash = false; + } + } + out +} + +/// Inventory the pypi lock the project carries. Fetchable resolution +/// (URL + sha256 of a pure `py3-none-any` wheel) comes from `uv.lock`; +/// `poetry.lock` and `--hash`-pinned `requirements.txt` contribute +/// DISCOVERY-only entries (no recorded URL; platform-independent wheel +/// choice is not derivable offline). Pipenv/pdm locks: not yet read. +pub async fn inventory_pypi_locks(project_root: &Path) -> Option> { + if let Some(out) = inventory_uv_lock(project_root).await { + return Some(out); + } + if let Some(out) = inventory_poetry_lock(project_root).await { + return Some(out); + } + inventory_requirements_txt(project_root).await +} + +/// uv.lock: TOML `[[package]]` blocks with `name`/`version` and +/// `wheels = [{ url, hash = "sha256:…" }, …]` entries. 
+async fn inventory_uv_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("uv.lock")) + .await + .ok()?; + let mut out = Vec::new(); + // Line-oriented: uv emits `[[package]]` blocks; wheels live either as + // inline `{ url = "…", hash = "sha256:…" }` table rows or one-line + // arrays. A pure-python wheel ends `py3-none-any.whl`. + let mut name: Option = None; + let mut version: Option = None; + let mut sourced_registry = true; + let mut wheel: Option<(String, String)> = None; + let flush = |name: &mut Option, + version: &mut Option, + sourced_registry: &mut bool, + wheel: &mut Option<(String, String)>, + out: &mut Vec| { + if let (Some(n), Some(v)) = (name.take(), version.take()) { + let canonical = pep503(&n); + if *sourced_registry + && path_safety::is_safe_single_segment(&canonical) + && path_safety::is_safe_single_segment(&v) + { + let (resolved, integrity) = match wheel.take() { + Some((url, sha)) => (http_url(&url), LockIntegrity::Sha256Hex(sha)), + None => (None, LockIntegrity::None), + }; + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{canonical}@{v}"), + name: canonical, + version: v, + resolved, + integrity, + }); + } + } + *sourced_registry = true; + *wheel = None; + }; + for line in text.lines() { + let t = line.trim(); + if t == "[[package]]" { + flush( + &mut name, + &mut version, + &mut sourced_registry, + &mut wheel, + &mut out, + ); + continue; + } + if let Some(v) = t.strip_prefix("name = ") { + name = Some(v.trim_matches('"').to_string()); + } else if let Some(v) = t.strip_prefix("version = ") { + version = Some(v.trim_matches('"').to_string()); + } else if t.starts_with("source = ") { + // Registry packages: `source = { registry = "…" }`; editable/ + // virtual/path/git sources are not fetchable artifacts. 
+ sourced_registry = t.contains("registry"); + } else if wheel.is_none() && t.contains("py3-none-any.whl") { + // `{ url = "…py3-none-any.whl", hash = "sha256:…" }` + let url = t + .split("url = \"") + .nth(1) + .and_then(|r| r.split('"').next()) + .unwrap_or(""); + let sha = t + .split("hash = \"sha256:") + .nth(1) + .and_then(|r| r.split('"').next()) + .unwrap_or(""); + if !url.is_empty() && sha.len() == 64 && sha.bytes().all(|b| b.is_ascii_hexdigit()) { + wheel = Some((url.to_string(), sha.to_ascii_lowercase())); + } + } + } + flush( + &mut name, + &mut version, + &mut sourced_registry, + &mut wheel, + &mut out, + ); + Some(dedup_prefer_integrity(out)) +} + +/// poetry.lock: `[[package]]` blocks with `name`/`version` — discovery +/// only (file hashes exist but carry no URLs and no platform choice). +async fn inventory_poetry_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("poetry.lock")) + .await + .ok()?; + let mut out = Vec::new(); + let mut in_package = false; + let mut name: Option = None; + for line in text.lines() { + let t = line.trim(); + if t == "[[package]]" { + in_package = true; + name = None; + continue; + } + if t.starts_with('[') && t != "[[package]]" { + in_package = false; + continue; + } + if !in_package { + continue; + } + if let Some(v) = t.strip_prefix("name = ") { + name = Some(pep503(v.trim_matches('"'))); + } else if let Some(v) = t.strip_prefix("version = ") { + if let Some(n) = name.take() { + let v = v.trim_matches('"').to_string(); + if path_safety::is_safe_single_segment(&n) + && path_safety::is_safe_single_segment(&v) + { + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{n}@{v}"), + name: n, + version: v, + resolved: None, + integrity: LockIntegrity::None, + }); + } + } + } + } + if out.is_empty() { + return None; + } + Some(dedup_prefer_integrity(out)) +} + +/// requirements.txt with exact `==` pins — discovery only. 
+async fn inventory_requirements_txt(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("requirements.txt")) + .await + .ok()?; + let mut out = Vec::new(); + for line in text.lines() { + let t = line.trim(); + if t.is_empty() || t.starts_with('#') || t.starts_with('-') { + continue; + } + // `name==version` (strip extras, env markers, hash continuations). + let spec = t.split(';').next().unwrap_or(t).trim(); + let spec = spec.split_whitespace().next().unwrap_or(spec); + let Some((raw_name, version)) = spec.split_once("==") else { + continue; + }; + let name = pep503(raw_name.split('[').next().unwrap_or(raw_name).trim()); + let version = version.trim().to_string(); + if name.is_empty() + || !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + || !version.chars().next().is_some_and(|c| c.is_ascii_digit()) + { + continue; + } + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{name}@{version}"), + name, + version, + resolved: None, + integrity: LockIntegrity::None, + }); + } + if out.is_empty() { + return None; + } + Some(dedup_prefer_integrity(out)) +} + +// ──────────────── registry-fragment recovery from the ledger ──────────────── + +/// Recover the PRE-VENDOR registry resolution of a vendored package from its +/// ledger entry's wiring `original` fragments (and `entry.lock` for cargo), +/// as a fetchable [`LockfileEntry`]. +/// +/// This is the rebuild path for artifacts that are referenced by the rewired +/// lockfile but missing on disk: the live lockfile no longer carries the +/// registry resolution (it points at `.socket/vendor/...`), but `--revert`'s +/// restore data does. golang is deliberately absent — go.sum is never +/// rewired, so the standard [`inventory_project`]/[`lookup`] path covers it. +/// +/// SECURITY: state.json is committed and tamper-able. 
Recovered URLs go +/// through the same http(s)-only gate as inventoried ones, recovered hashes +/// are shape-validated here and verified against the fetched bytes +/// fail-closed by the fetch layer — a poisoned fragment can at worst make +/// the fetch fail, never land unverified content. +pub async fn recover_lock_entry( + project_root: &Path, + entry: &super::state::VendorEntry, +) -> Result { + let (name, version) = parse_base_purl_coords(&entry.base_purl) + .ok_or_else(|| format!("unparseable base purl `{}`", entry.base_purl))?; + + match entry.ecosystem.as_str() { + "npm" => recover_npm_fragment(entry, &name, &version), + "cargo" => { + let checksum = entry + .lock + .as_ref() + .and_then(|l| l.checksum.clone()) + .filter(|c| is_hex_of_len(c, 64)) + .ok_or_else(|| { + "the ledger records no pre-vendor Cargo.lock checksum".to_string() + })?; + Ok(LockfileEntry { + ecosystem: "cargo", + purl: format!("pkg:cargo/{name}@{version}"), + name, + version, + resolved: None, + integrity: LockIntegrity::Sha256Hex(checksum.to_ascii_lowercase()), + }) + } + "composer" => { + let original = wiring_original(entry, &["composer_lock_package"]) + .ok_or_else(|| "no pre-vendor composer.lock fragment recorded".to_string())?; + let dist = original + .get("dist") + .ok_or_else(|| "the pre-vendor composer.lock fragment has no dist".to_string())?; + let url = dist + .get("url") + .and_then(serde_json::Value::as_str) + .and_then(http_url) + .ok_or_else(|| "the pre-vendor dist has no http(s) url".to_string())?; + let shasum = dist + .get("shasum") + .and_then(serde_json::Value::as_str) + .filter(|s| is_hex_of_len(s, 40)) + .ok_or_else(|| { + "the pre-vendor dist records no shasum; refusing an unverifiable fetch" + .to_string() + })?; + Ok(LockfileEntry { + ecosystem: "composer", + purl: format!("pkg:composer/{name}@{version}"), + name, + version, + resolved: Some(url), + integrity: LockIntegrity::Sha1Hex(shasum.to_ascii_lowercase()), + }) + } + "gem" => { + let line = 
wiring_original(entry, &["gemfile_lock_checksum"]) + .and_then(|v| v.as_str().map(str::to_string)) + .ok_or_else(|| "no pre-vendor Gemfile.lock checksum recorded".to_string())?; + let sha = line + .split("sha256=") + .nth(1) + .map(|rest| { + rest.trim_end_matches(',') + .trim() + .chars() + .take_while(|c| c.is_ascii_hexdigit()) + .collect::() + }) + .filter(|s| is_hex_of_len(s, 64)) + .ok_or_else(|| { + "the pre-vendor checksum line has no sha256; refusing an unverifiable fetch" + .to_string() + })?; + let base = gem_remote_base(project_root) + .await + .unwrap_or_else(|| "https://rubygems.org".to_string()); + Ok(LockfileEntry { + ecosystem: "gem", + purl: format!("pkg:gem/{name}@{version}"), + resolved: http_url(&format!( + "{}/downloads/{name}-{version}.gem", + base.trim_end_matches('/') + )), + name, + version, + integrity: LockIntegrity::Sha256Hex(sha.to_ascii_lowercase()), + }) + } + "pypi" => { + if entry.artifact.platform_locked == Some(true) { + return Err( + "the vendored wheel is platform-locked (compiled); it cannot be rebuilt from the registry" + .to_string(), + ); + } + let unit = wiring_original(entry, &["uv_lock_package"]) + .and_then(|v| v.as_str().map(str::to_string)) + .ok_or_else(|| "no pre-vendor uv.lock fragment recorded".to_string())?; + let (url, sha) = pure_wheel_from_uv_unit(&unit).ok_or_else(|| { + "the pre-vendor uv.lock fragment lists no verifiable pure wheel".to_string() + })?; + Ok(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{name}@{version}"), + name, + version, + resolved: Some(url), + integrity: LockIntegrity::Sha256Hex(sha), + }) + } + other => Err(format!( + "no ledger-based registry recovery for ecosystem `{other}`" + )), + } +} + +/// The integrity the REWIRED npm-family lockfile records for a vendored +/// artifact at `artifact_rel` (forward-slashed, no `./` prefix). 
This is +/// the integrity of OUR deterministically packed tarball — the trust +/// anchor for repair's no-ledger reconstruction: a rebuilt tarball that +/// matches it is exactly what the package manager would have installed. +/// +/// package-lock/shrinkwrap are parsed as JSON; the text formats (pnpm, +/// yarn classic/berry, bun) are scanned with a bounded forward window from +/// each reference line. +pub async fn wired_vendor_integrity( + project_root: &Path, + artifact_rel: &str, +) -> Option { + let rel = artifact_rel.trim_start_matches("./"); + + // JSON locks: resolved == "file:" (npm writes exactly this form). + for lock in ["npm-shrinkwrap.json", "package-lock.json"] { + let Ok(bytes) = tokio::fs::read(project_root.join(lock)).await else { + continue; + }; + let Ok(v) = serde_json::from_slice::(&bytes) else { + continue; + }; + if let Some(pkgs) = v.get("packages").and_then(serde_json::Value::as_object) { + for entry in pkgs.values() { + let resolved = entry.get("resolved").and_then(serde_json::Value::as_str); + if resolved.is_some_and(|r| r.trim_start_matches("file:") == rel) { + if let Some(sri) = entry + .get("integrity") + .and_then(serde_json::Value::as_str) + .filter(|s| looks_like_sri(s)) + { + return Some(LockIntegrity::Sri(sri.to_string())); + } + } + } + } + } + + // Text locks: any line referencing the artifact path, integrity within + // a short forward window (the same block). + for lock in ["pnpm-lock.yaml", "yarn.lock", "bun.lock"] { + let Ok(text) = tokio::fs::read_to_string(project_root.join(lock)).await else { + continue; + }; + let lines: Vec<&str> = text.lines().collect(); + for (i, line) in lines.iter().enumerate() { + if !line.contains(rel) { + continue; + } + for probe in lines.iter().take((i + 6).min(lines.len())).skip(i) { + // pnpm `resolution: {integrity: …}` / classic `integrity …` + // / bun tuple `"sha512-…"`. 
+ if let Some(v) = inline_yaml_field(probe, "integrity:") { + if looks_like_sri(&v) { + return Some(LockIntegrity::Sri(v)); + } + } + if let Some(rest) = probe.trim().strip_prefix("integrity ") { + let v = rest.trim().trim_matches('"'); + if looks_like_sri(v) { + return Some(LockIntegrity::Sri(v.to_string())); + } + } + if let Some(sri) = probe.split('"').rev().find(|tok| looks_like_sri(tok)) { + return Some(LockIntegrity::Sri(sri.to_string())); + } + // yarn berry: `checksum: 10c0/…`. + if let Some(v) = inline_yaml_field(probe, "checksum:") { + if v.split_once('/') + .is_some_and(|(k, b)| !k.is_empty() && !b.is_empty()) + { + return Some(LockIntegrity::BerryChecksum(v)); + } + } + } + } + } + None +} + +/// `pkg:/@` → (name, version). The name may itself +/// contain `/` (npm scopes, go modules); the version is after the LAST `@`. +fn parse_base_purl_coords(base_purl: &str) -> Option<(String, String)> { + let rest = base_purl.strip_prefix("pkg:")?; + let (_, name_ver) = rest.split_once('/')?; + let (name, version) = name_ver.rsplit_once('@')?; + if name.is_empty() || version.is_empty() { + return None; + } + Some((name.to_string(), version.to_string())) +} + +/// First wiring record of one of `kinds` carrying an `original` payload. +fn wiring_original<'a>( + entry: &'a super::state::VendorEntry, + kinds: &[&str], +) -> Option<&'a serde_json::Value> { + entry + .wiring + .iter() + .find(|r| kinds.contains(&r.kind.as_str()) && r.original.is_some()) + .and_then(|r| r.original.as_ref()) +} + +fn is_hex_of_len(s: &str, len: usize) -> bool { + s.len() == len && s.bytes().all(|b| b.is_ascii_hexdigit()) +} + +/// Per-flavor npm recovery: the wiring kinds disambiguate the lock flavor, +/// each fragment yields (resolved?, integrity). 
+fn recover_npm_fragment( + entry: &super::state::VendorEntry, + name: &str, + version: &str, +) -> Result { + let mk = |resolved: Option, integrity: LockIntegrity| LockfileEntry { + ecosystem: "npm", + purl: format!("pkg:npm/{name}@{version}"), + name: name.to_string(), + version: version.to_string(), + resolved, + integrity, + }; + + // package-lock / shrinkwrap: the original is the full lock entry object. + if let Some(obj) = wiring_original(entry, &["npm_lock_entry", "npm_lock_legacy_entry"]) { + let resolved = obj + .get("resolved") + .and_then(serde_json::Value::as_str) + .and_then(http_url); + if let Some(sri) = obj + .get("integrity") + .and_then(serde_json::Value::as_str) + .filter(|s| looks_like_sri(s)) + { + return Ok(mk(resolved, LockIntegrity::Sri(sri.to_string()))); + } + } + // pnpm: the original is the packages block's lines; pull + // `resolution: {integrity: …, tarball: …}`. + if let Some(lines) = wiring_original(entry, &["pnpm_lock_package"]).and_then(lines_of) { + let mut sri = None; + let mut tarball = None; + for line in &lines { + if let Some(v) = inline_yaml_field(line, "integrity:") { + sri = sri.or(Some(v)); + } + if let Some(v) = inline_yaml_field(line, "tarball:") { + tarball = tarball.or(http_url(&v)); + } + } + if let Some(sri) = sri.filter(|s| looks_like_sri(s)) { + return Ok(mk(tarball, LockIntegrity::Sri(sri))); + } + } + // yarn classic: block lines carry `integrity ` (preferred) and/or + // `resolved "#"`. 
+ if let Some(lines) = wiring_original(entry, &["yarn_lock_block"]).and_then(lines_of) { + let mut url = None; + let mut sha1 = None; + let mut sri = None; + for line in &lines { + let t = line.trim(); + if let Some(rest) = t.strip_prefix("integrity ") { + let v = rest.trim().trim_matches('"'); + if looks_like_sri(v) { + sri = Some(v.to_string()); + } + } + if let Some(rest) = t.strip_prefix("resolved ") { + let v = rest.trim().trim_matches('"'); + let (u, frag) = v.split_once('#').unwrap_or((v, "")); + url = http_url(u); + if is_hex_of_len(frag, 40) { + sha1 = Some(frag.to_ascii_lowercase()); + } + } + } + if let Some(sri) = sri { + return Ok(mk(url, LockIntegrity::Sri(sri))); + } + if let Some(sha1) = sha1 { + return Ok(mk(url, LockIntegrity::Sha1Hex(sha1))); + } + } + // yarn berry: block lines carry `checksum: /`. + if let Some(lines) = wiring_original(entry, &["yarn_berry_lock_entry"]).and_then(lines_of) { + for line in &lines { + if let Some(v) = inline_yaml_field(line, "checksum:") { + if v.split_once('/') + .is_some_and(|(k, b)| !k.is_empty() && !b.is_empty()) + { + return Ok(mk(None, LockIntegrity::BerryChecksum(v))); + } + } + } + } + // bun: the original is the raw tuple line; the integrity is its last + // quoted SRI string. + if let Some(line) = + wiring_original(entry, &["bun_lock_package"]).and_then(|v| v.as_str().map(str::to_string)) + { + if let Some(sri) = line + .split('"') + .rev() + .find(|tok| looks_like_sri(tok)) + .map(str::to_string) + { + return Ok(mk(None, LockIntegrity::Sri(sri))); + } + } + Err("no pre-vendor npm registry fragment with a verifiable integrity recorded".to_string()) +} + +fn looks_like_sri(s: &str) -> bool { + ["sha512-", "sha384-", "sha256-", "sha1-"] + .iter() + .any(|p| s.starts_with(p) && s.len() > p.len()) +} + +/// A wiring `original` recorded as an array of text lines. 
+fn lines_of(v: &serde_json::Value) -> Option> { + v.as_array().map(|arr| { + arr.iter() + .filter_map(|l| l.as_str().map(str::to_string)) + .collect() + }) +} + +/// `… field: value` (optionally inside an inline `{…}` map) → value, with +/// trailing `,`/`}` and quotes stripped. +fn inline_yaml_field(line: &str, field: &str) -> Option { + let idx = line.find(field)?; + let rest = &line[idx + field.len()..]; + let end = rest.find([',', '}']).unwrap_or(rest.len()); + let v = rest[..end].trim().trim_matches(['\'', '"']).to_string(); + (!v.is_empty()).then_some(v) +} + +/// The `GEM remote:` base of the (unrewired) Gemfile.lock. +async fn gem_remote_base(project_root: &Path) -> Option { + let text = tokio::fs::read_to_string(project_root.join("Gemfile.lock")) + .await + .ok()?; + let mut in_gem = false; + for line in text.lines() { + if line.trim_end() == "GEM" { + in_gem = true; + continue; + } + if in_gem { + if let Some(rest) = line.trim().strip_prefix("remote:") { + return http_url(rest.trim()); + } + if !line.starts_with(' ') && !line.trim().is_empty() { + in_gem = false; + } + } + } + None +} + +/// First `{ url = "…", hash = "sha256:…" }` wheel in a uv.lock `[[package]]` +/// unit whose filename is a PURE wheel (`-none-any.whl`). 
+fn pure_wheel_from_uv_unit(unit: &str) -> Option<(String, String)> { + let mut search = unit; + while let Some(uidx) = search.find("url = \"") { + let after = &search[uidx + 7..]; + let uend = after.find('"')?; + let url = &after[..uend]; + let rest = &after[uend..]; + let advance = uidx + 7 + uend; + if url.ends_with("-none-any.whl") { + if let Some(hidx) = rest.find("hash = \"sha256:") { + let hafter = &rest[hidx + 15..]; + let hend = hafter.find('"')?; + let sha = &hafter[..hend]; + if is_hex_of_len(sha, 64) { + if let Some(url) = http_url(url) { + return Some((url, sha.to_ascii_lowercase())); + } + } + } + } + search = &search[advance..]; + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + async fn write(root: &Path, name: &str, content: &str) { + tokio::fs::write(root.join(name), content).await.unwrap(); + } + + fn entry<'a>(entries: &'a [LockfileEntry], name: &str) -> &'a LockfileEntry { + entries + .iter() + .find(|e| e.name == name) + .unwrap_or_else(|| panic!("no entry for {name}: {entries:?}")) + } + + // ── package-lock ────────────────────────────────────────────────────── + + const PACKAGE_LOCK: &str = r#"{ + "name": "fixture", + "version": "1.0.0", + "lockfileVersion": 3, + "packages": { + "": { "name": "fixture", "version": "1.0.0" }, + "packages/member": { "name": "member", "version": "0.0.1" }, + "node_modules/member": { "resolved": "packages/member", "link": true }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "integrity": "sha512-XI5MPz==" + }, + "node_modules/@scope/pkg": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@scope/pkg/-/pkg-2.0.0.tgz", + "integrity": "sha512-scoped==" + }, + "node_modules/bundled-dep": { + "version": "1.0.0", + "inBundle": true + }, + "node_modules/git-dep": { + "version": "0.5.0", + "resolved": "git+ssh://git@github.com/x/git-dep.git#abc" + }, + "node_modules/vendored": { + "version": "3.0.0", + 
"resolved": "file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz", + "integrity": "sha512-ours==" + }, + "node_modules/evil": { + "version": "../../escape", + "resolved": "https://registry.npmjs.org/evil/-/evil-1.0.0.tgz", + "integrity": "sha512-evil==" + } + } +} +"#; + + #[tokio::test] + async fn package_lock_inventories_registry_entries() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::PackageLock); + + let lp = entry(&entries, "left-pad"); + assert_eq!(lp.version, "1.3.0"); + assert_eq!(lp.purl, "pkg:npm/left-pad@1.3.0"); + assert_eq!( + lp.resolved.as_deref(), + Some("https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz") + ); + assert_eq!(lp.integrity, LockIntegrity::Sri("sha512-XI5MPz==".into())); + + let scoped = entry(&entries, "@scope/pkg"); + assert_eq!(scoped.purl, "pkg:npm/@scope/pkg@2.0.0"); + + // git deps stay listed (discovery) but carry no fetchable URL. + let git = entry(&entries, "git-dep"); + assert_eq!(git.resolved, None); + assert_eq!(git.integrity, LockIntegrity::None); + + // Workspace members, links, bundled deps, our vendored spec, and + // the unsafe-version entry are all absent. 
+ for absent in ["member", "fixture", "bundled-dep", "vendored", "evil"] { + assert!( + !entries.iter().any(|e| e.name == absent), + "{absent} must not be inventoried: {entries:?}" + ); + } + } + + #[tokio::test] + async fn shrinkwrap_wins_over_package_lock() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + write( + tmp.path(), + "npm-shrinkwrap.json", + r#"{ "lockfileVersion": 3, "packages": { + "node_modules/only-in-shrinkwrap": { "version": "9.9.9" } } }"#, + ) + .await; + + let (_, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert!(entries.iter().any(|e| e.name == "only-in-shrinkwrap")); + assert!(!entries.iter().any(|e| e.name == "left-pad")); + } + + #[tokio::test] + async fn legacy_v1_lock_without_packages_map_yields_none() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "package-lock.json", + r#"{ "lockfileVersion": 1, "dependencies": { "left-pad": { "version": "1.3.0" } } }"#, + ) + .await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + } + + // ── pnpm ────────────────────────────────────────────────────────────── + + const PNPM_LOCK: &str = "lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + +importers: + + .: + dependencies: + left-pad: + specifier: 1.3.0 + version: 1.3.0 + +packages: + + left-pad@1.3.0: + resolution: {integrity: sha512-XI5MPz==} + + '@scope/pkg@2.0.0': + resolution: {integrity: sha512-scoped==} + + peer-user@4.0.0(left-pad@1.3.0): + resolution: {integrity: sha512-peer==} + + local-thing@file:packages/local: + resolution: {directory: packages/local, type: directory} + + vendored@file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz: + resolution: {integrity: sha512-ours==, tarball: file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz} + +snapshots: + + left-pad@1.3.0: {} +"; + + #[tokio::test] + async fn pnpm_v9_keys_parse_with_peer_suffix_and_scoped_quoting() { + 
let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "pnpm-lock.yaml", PNPM_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::Pnpm); + + assert_eq!( + entry(&entries, "left-pad").integrity, + LockIntegrity::Sri("sha512-XI5MPz==".into()) + ); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + assert_eq!(entry(&entries, "peer-user").version, "4.0.0"); + // registry entries carry no URL in v9 — constructed at fetch time. + assert_eq!(entry(&entries, "left-pad").resolved, None); + for absent in ["local-thing", "vendored"] { + assert!(!entries.iter().any(|e| e.name == absent), "{entries:?}"); + } + } + + // ── yarn classic ────────────────────────────────────────────────────── + + const YARN_CLASSIC: &str = "# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. +# yarn lockfile v1 + + +\"@scope/pkg@^2.0.0\": + version \"2.0.0\" + resolved \"https://registry.yarnpkg.com/@scope/pkg/-/pkg-2.0.0.tgz#aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\" + integrity sha512-scoped== + +left-pad@1.3.0, left-pad@^1.3.0: + version \"1.3.0\" + resolved \"https://registry.yarnpkg.com/left-pad/-/left-pad-1.3.0.tgz#bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\" + integrity sha512-XI5MPz== + +old-school@0.1.0: + version \"0.1.0\" + resolved \"https://registry.yarnpkg.com/old-school/-/old-school-0.1.0.tgz#cccccccccccccccccccccccccccccccccccccccc\" + +aliased@npm:real-name@^3.0.0: + version \"3.0.0\" + resolved \"https://registry.yarnpkg.com/real-name/-/real-name-3.0.0.tgz#dddddddddddddddddddddddddddddddddddddddd\" + integrity sha512-alias== +"; + + #[tokio::test] + async fn yarn_classic_blocks_yield_resolved_sha1_and_integrity() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "yarn.lock", YARN_CLASSIC).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::YarnClassic); + + let lp = entry(&entries, "left-pad"); + 
assert_eq!( + lp.resolved.as_deref(), + Some("https://registry.yarnpkg.com/left-pad/-/left-pad-1.3.0.tgz"), + "the #sha1 fragment is split off the URL" + ); + assert_eq!(lp.integrity, LockIntegrity::Sri("sha512-XI5MPz==".into())); + + // Integrity-less old locks fall back to the sha1 fragment. + assert_eq!( + entry(&entries, "old-school").integrity, + LockIntegrity::Sha1Hex("c".repeat(40)) + ); + + // `alias@npm:real@range` resolves to the real name. + assert!(entries.iter().any(|e| e.name == "real-name")); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + } + + // ── yarn berry ──────────────────────────────────────────────────────── + + const YARN_BERRY: &str = + "# This file is generated by running \"yarn install\" inside your project. +# Manifest files (package.json) are also used. + +__metadata: + version: 8 + cacheKey: 10c0 + +\"fixture@workspace:.\": + version: 0.0.0-use.local + resolution: \"fixture@workspace:.\" + languageName: unknown + linkType: soft + +\"left-pad@npm:1.3.0\": + version: 1.3.0 + resolution: \"left-pad@npm:1.3.0\" + checksum: 10c0/deadbeefcafe== + languageName: node + linkType: hard + +\"@scope/pkg@npm:^2.0.0\": + version: 2.0.0 + resolution: \"@scope/pkg@npm:2.0.0\" + checksum: 10c0/scopedchecksum== + languageName: node + linkType: hard +"; + + #[tokio::test] + async fn yarn_berry_registry_resolutions_inventory_with_checksums() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "yarn.lock", YARN_BERRY).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::YarnBerry); + + let lp = entry(&entries, "left-pad"); + assert_eq!(lp.version, "1.3.0"); + assert_eq!( + lp.integrity, + LockIntegrity::BerryChecksum("10c0/deadbeefcafe==".into()) + ); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + // The workspace root is not a registry package. 
+ assert!(!entries.iter().any(|e| e.name == "fixture"), "{entries:?}"); + } + + // ── bun ─────────────────────────────────────────────────────────────── + + const BUN_LOCK: &str = r#"{ + "lockfileVersion": 1, + "workspaces": { + "": { "name": "fixture", "dependencies": { "left-pad": "1.3.0" } }, + }, + "packages": { + "left-pad": ["left-pad@1.3.0", "", {}, "sha512-XI5MPz=="], + "@scope/pkg": ["@scope/pkg@2.0.0", "", {}, "sha512-scoped=="], + "vendored": ["vendored@file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz", {}], + "linked": ["linked@workspace:packages/linked", {}], + } +} +"#; + + #[tokio::test] + async fn bun_registry_tuples_parse_and_locals_are_skipped() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "bun.lock", BUN_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::Bun); + + assert_eq!( + entry(&entries, "left-pad").integrity, + LockIntegrity::Sri("sha512-XI5MPz==".into()) + ); + assert_eq!(entry(&entries, "left-pad").resolved, None); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + for absent in ["vendored", "linked"] { + assert!(!entries.iter().any(|e| e.name == absent), "{entries:?}"); + } + } + + // ── shared semantics ────────────────────────────────────────────────── + + #[tokio::test] + async fn lookup_bridges_percent_encoded_purls() { + let entries = vec![ + LockfileEntry::npm("@scope/pkg", "2.0.0", None, LockIntegrity::None), + LockfileEntry::npm("left-pad", "1.3.0", None, LockIntegrity::None), + ]; + assert!(lookup(&entries, "pkg:npm/%40scope/pkg@2.0.0").is_some()); + assert!(lookup(&entries, "pkg:npm/@scope/pkg@2.0.0").is_some()); + assert!(lookup(&entries, "pkg:npm/left-pad@1.3.0?artifact_id=x").is_some()); + assert!(lookup(&entries, "pkg:npm/left-pad@9.9.9").is_none()); + assert!(lookup(&entries, "pkg:pypi/left-pad@1.3.0").is_none()); + } + + #[tokio::test] + async fn 
dedup_prefers_integrity_bearing_instance() { + let raw = vec![ + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), + LockfileEntry::npm( + "dup", + "1.0.0", + None, + LockIntegrity::Sri("sha512-x==".into()), + ), + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), + ]; + let out = finalize_npm(raw); + assert_eq!(out.len(), 1); + assert_eq!(out[0].integrity, LockIntegrity::Sri("sha512-x==".into())); + } + + #[cfg(feature = "cargo")] + #[tokio::test] + async fn cargo_lock_inventories_crates_io_entries() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "Cargo.lock", + r#"# This file is automatically @generated by Cargo. +version = 4 + +[[package]] +name = "fixture" +version = "0.1.0" + +[[package]] +name = "serde" +version = "1.0.200" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f" + +[[package]] +name = "git-dep" +version = "0.5.0" +source = "git+https://github.com/x/git-dep?rev=abc#abc" + +[[package]] +name = "sparse-crate" +version = "2.0.0" +source = "sparse+https://index.crates.io/" +checksum = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" +"#, + ) + .await; + + let entries = inventory_cargo_lock(tmp.path()).await.unwrap(); + let serde_entry = entry(&entries, "serde"); + assert_eq!(serde_entry.version, "1.0.200"); + assert_eq!(serde_entry.purl, "pkg:cargo/serde@1.0.200"); + assert_eq!( + serde_entry.integrity, + LockIntegrity::Sha256Hex( + "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f".into() + ) + ); + assert!(matches!( + entry(&entries, "sparse-crate").integrity, + LockIntegrity::Sha256Hex(_) + )); + // Workspace member (no source) excluded; git source unverifiable. 
+ assert!(!entries.iter().any(|e| e.name == "fixture")); + assert_eq!(entry(&entries, "git-dep").integrity, LockIntegrity::None); + } + + #[cfg(feature = "golang")] + #[tokio::test] + async fn go_sum_inventories_module_zip_lines() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "go.sum", + "github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=\n\ + github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=\n\ + golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=\n", + ) + .await; + + let entries = inventory_go_sum(tmp.path()).await.unwrap(); + assert_eq!(entries.len(), 2, "the /go.mod line is skipped: {entries:?}"); + let gin = entry(&entries, "github.com/gin-gonic/gin"); + assert_eq!(gin.version, "v1.9.1"); + assert_eq!(gin.purl, "pkg:golang/github.com/gin-gonic/gin@v1.9.1"); + assert_eq!( + gin.integrity, + LockIntegrity::GoH1("h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=".into()) + ); + } + + #[tokio::test] + async fn lookup_matches_cargo_and_golang_purls() { + let entries = vec![ + LockfileEntry { + ecosystem: "cargo", + name: "serde".into(), + version: "1.0.200".into(), + purl: "pkg:cargo/serde@1.0.200".into(), + resolved: None, + integrity: LockIntegrity::None, + }, + LockfileEntry { + ecosystem: "golang", + name: "github.com/x/y".into(), + version: "v1.0.0".into(), + purl: "pkg:golang/github.com/x/y@v1.0.0".into(), + resolved: None, + integrity: LockIntegrity::None, + }, + ]; + assert!(lookup(&entries, "pkg:cargo/serde@1.0.200").is_some()); + assert!(lookup(&entries, "pkg:golang/github.com/x/y@v1.0.0").is_some()); + assert!(lookup(&entries, "pkg:cargo/serde@9.9.9").is_none()); + assert!( + lookup(&entries, "pkg:npm/serde@1.0.200").is_none(), + "ecosystem tags must match, not just name@version" + ); + } + + #[cfg(feature = "composer")] + #[tokio::test] + async fn composer_lock_inventories_dist_entries() { + let tmp = tempfile::tempdir().unwrap(); + write( + 
tmp.path(), + "composer.lock", + r#"{ + "packages": [ + { + "name": "Monolog/Monolog", + "version": "v3.5.0", + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Seldaek/monolog/zipball/abc", + "shasum": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + } + }, + { + "name": "vendored/pkg", + "version": "1.0.0", + "dist": { "type": "path", "url": ".socket/vendor/composer/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored/pkg@1.0.0" } + } + ], + "packages-dev": [ + { + "name": "symfony/console", + "version": "v6.4.1", + "dist": { "type": "zip", "url": "https://example.com/console.zip", "shasum": "" } + } + ] +}"#, + ) + .await; + + let entries = inventory_composer_lock(tmp.path()).await.unwrap(); + let monolog = entry(&entries, "monolog/monolog"); + assert_eq!( + monolog.version, "3.5.0", + "leading v dropped, name lowercased" + ); + assert_eq!(monolog.purl, "pkg:composer/monolog/monolog@3.5.0"); + assert!(matches!(monolog.integrity, LockIntegrity::Sha1Hex(_))); + assert!(monolog.resolved.as_deref().unwrap().contains("zipball")); + // Empty shasum → discovery-only; path dist (ours) excluded. 
+ assert_eq!( + entry(&entries, "symfony/console").integrity, + LockIntegrity::None + ); + assert!(!entries.iter().any(|e| e.name == "vendored/pkg")); + } + + #[tokio::test] + async fn gemfile_lock_inventories_specs_and_checksums() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "Gemfile.lock", + "GEM\n remote: https://rubygems.org/\n specs:\n rails (7.1.0)\n \ + actionpack (= 7.1.0)\n rack (3.0.8)\n nokogiri (1.16.5-arm64-darwin)\n\n\ + PLATFORMS\n ruby\n\nDEPENDENCIES\n rails\n\nCHECKSUMS\n \ + rails (7.1.0) sha256=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n\n\ + BUNDLED WITH\n 2.6.0\n", + ) + .await; + + let entries = inventory_gemfile_lock(tmp.path()).await.unwrap(); + let rails = entry(&entries, "rails"); + assert_eq!(rails.version, "7.1.0"); + assert_eq!(rails.purl, "pkg:gem/rails@7.1.0"); + assert!(matches!(rails.integrity, LockIntegrity::Sha256Hex(_))); + assert_eq!( + rails.resolved.as_deref(), + Some("https://rubygems.org/downloads/rails-7.1.0.gem") + ); + // No CHECKSUMS entry → discovery-only; platform gem skipped; + // dependency range lines never parse as specs. 
+ assert_eq!(entry(&entries, "rack").integrity, LockIntegrity::None); + assert!(!entries.iter().any(|e| e.name == "nokogiri")); + assert!(!entries.iter().any(|e| e.name == "actionpack")); + } + + #[tokio::test] + async fn uv_lock_inventories_pure_wheels() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "uv.lock", + r#"version = 1 + +[[package]] +name = "Requests" +version = "2.28.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/requests-2.28.0-py3-none-any.whl", hash = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" }, +] + +[[package]] +name = "native-only" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/native_only-1.0.0-cp312-macosx.whl", hash = "sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" }, +] + +[[package]] +name = "local-proj" +version = "0.0.1" +source = { editable = "." } +"#, + ) + .await; + + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + let requests = entry(&entries, "requests"); + assert_eq!(requests.purl, "pkg:pypi/requests@2.28.0", "PEP 503 name"); + assert!(matches!(requests.integrity, LockIntegrity::Sha256Hex(_))); + assert!(requests + .resolved + .as_deref() + .unwrap() + .ends_with("py3-none-any.whl")); + // Platform-only wheels → discovery-only; editable sources excluded. 
+ assert_eq!( + entry(&entries, "native-only").integrity, + LockIntegrity::None + ); + assert!(!entries.iter().any(|e| e.name == "local-proj")); + } + + #[tokio::test] + async fn poetry_and_requirements_are_discovery_only() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "poetry.lock", + "[[package]]\nname = \"Flask_Login\"\nversion = \"0.6.3\"\n\n[metadata]\nlock-version = \"2.0\"\n", + ) + .await; + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + let fl = entry(&entries, "flask-login"); + assert_eq!(fl.purl, "pkg:pypi/flask-login@0.6.3"); + assert_eq!(fl.integrity, LockIntegrity::None); + + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "requirements.txt", + "# pinned\nrequests[security]==2.28.0 --hash=sha256:abc \\\n --hash=sha256:def\nflask>=2.0\n-e .\n", + ) + .await; + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + assert_eq!(entries.len(), 1, "{entries:?}"); + assert_eq!(entries[0].purl, "pkg:pypi/requests@2.28.0"); + } + + #[tokio::test] + async fn unsupported_flavors_yield_none() { + // PnP marker wins over any lockfile. + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), ".pnp.cjs", "/* pnp */").await; + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + + // pnpm v6. + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "pnpm-lock.yaml", "lockfileVersion: '6.0'\n").await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + + // No lockfile at all. 
+ let tmp = tempfile::tempdir().unwrap(); + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + } +} + +#[cfg(test)] +mod recover_tests { + use super::super::state::WiringAction; + use super::super::state::{CargoLockOriginal, VendorArtifact, VendorEntry, WiringRecord}; + use super::*; + + const UUID: &str = "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"; + + fn entry(eco: &str, base_purl: &str, wiring: Vec) -> VendorEntry { + VendorEntry { + ecosystem: eco.into(), + base_purl: base_purl.into(), + uuid: UUID.into(), + artifact: VendorArtifact { + path: format!(".socket/vendor/{eco}/{UUID}/x"), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring, + lock: None, + took_over_go_patches: false, + detached: false, + record: None, + flavor: None, + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } + } + + fn rec(kind: &str, original: serde_json::Value) -> WiringRecord { + WiringRecord { + file: "lock".into(), + kind: kind.into(), + action: WiringAction::Rewritten, + key: Some("k".into()), + original: Some(original), + new: None, + } + } + + #[tokio::test] + async fn npm_lock_entry_fragment_recovers_sri_and_url() { + let tmp = tempfile::tempdir().unwrap(); + let e = entry( + "npm", + "pkg:npm/@scope/x@1.2.3", + vec![rec( + "npm_lock_entry", + serde_json::json!({ + "resolved": "https://registry.npmjs.org/@scope/x/-/x-1.2.3.tgz", + "integrity": "sha512-AAAA", + }), + )], + ); + let got = recover_lock_entry(tmp.path(), &e).await.unwrap(); + assert_eq!(got.ecosystem, "npm"); + assert_eq!(got.name, "@scope/x"); + assert_eq!(got.version, "1.2.3"); + assert_eq!( + got.resolved.as_deref(), + Some("https://registry.npmjs.org/@scope/x/-/x-1.2.3.tgz") + ); + assert_eq!(got.integrity, LockIntegrity::Sri("sha512-AAAA".into())); + } + + #[tokio::test] + async fn pnpm_package_lines_recover_integrity_and_tarball() { + let tmp = tempfile::tempdir().unwrap(); + let e = entry( + "npm", + "pkg:npm/left-pad@1.3.0", + vec![rec( + 
"pnpm_lock_package", + serde_json::json!([ + " left-pad@1.3.0:", + " resolution: {integrity: sha512-BBBB, tarball: https://npm.corp/left-pad-1.3.0.tgz}", + ]), + )], + ); + let got = recover_lock_entry(tmp.path(), &e).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sri("sha512-BBBB".into())); + assert_eq!( + got.resolved.as_deref(), + Some("https://npm.corp/left-pad-1.3.0.tgz") + ); + } + + #[tokio::test] + async fn yarn_classic_block_prefers_sri_else_sha1() { + let tmp = tempfile::tempdir().unwrap(); + let sha1 = "a".repeat(40); + let with_both = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "yarn_lock_block", + serde_json::json!([ + "x@^1.0.0:", + " version \"1.0.0\"", + format!(" resolved \"https://registry.yarnpkg.com/x/-/x-1.0.0.tgz#{sha1}\""), + " integrity sha512-CCCC", + ]), + )], + ); + let got = recover_lock_entry(tmp.path(), &with_both).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sri("sha512-CCCC".into())); + assert_eq!( + got.resolved.as_deref(), + Some("https://registry.yarnpkg.com/x/-/x-1.0.0.tgz") + ); + + let sha1_only = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "yarn_lock_block", + serde_json::json!([format!( + " resolved \"https://registry.yarnpkg.com/x/-/x-1.0.0.tgz#{sha1}\"" + )]), + )], + ); + let got = recover_lock_entry(tmp.path(), &sha1_only).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sha1Hex(sha1)); + } + + #[tokio::test] + async fn berry_checksum_and_bun_tuple_recover() { + let tmp = tempfile::tempdir().unwrap(); + let berry = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "yarn_berry_lock_entry", + serde_json::json!(["x@npm:1.0.0:", " checksum: 10c0/abcdef"]), + )], + ); + let got = recover_lock_entry(tmp.path(), &berry).await.unwrap(); + assert_eq!( + got.integrity, + LockIntegrity::BerryChecksum("10c0/abcdef".into()) + ); + assert_eq!(got.resolved, None); + + let bun = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "bun_lock_package", + serde_json::json!(" \"x\": 
[\"x@1.0.0\", \"\", {}, \"sha512-DDDD\"],"), + )], + ); + let got = recover_lock_entry(tmp.path(), &bun).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sri("sha512-DDDD".into())); + } + + #[tokio::test] + async fn cargo_recovers_from_entry_lock_checksum() { + let tmp = tempfile::tempdir().unwrap(); + let sha = "b".repeat(64); + let mut e = entry("cargo", "pkg:cargo/serde@1.0.0", vec![]); + e.lock = Some(CargoLockOriginal { + source: "registry+https://github.com/rust-lang/crates.io-index".into(), + checksum: Some(sha.clone()), + }); + let got = recover_lock_entry(tmp.path(), &e).await.unwrap(); + assert_eq!(got.ecosystem, "cargo"); + assert_eq!(got.integrity, LockIntegrity::Sha256Hex(sha)); + assert_eq!(got.resolved, None); + + // No checksum recorded → unrecoverable, never an unverified fetch. + let mut bare = entry("cargo", "pkg:cargo/serde@1.0.0", vec![]); + bare.lock = None; + assert!(recover_lock_entry(tmp.path(), &bare).await.is_err()); + } + + #[tokio::test] + async fn composer_gem_uv_fragments_recover() { + let tmp = tempfile::tempdir().unwrap(); + let sha1 = "c".repeat(40); + let composer = entry( + "composer", + "pkg:composer/monolog/monolog@2.9.1", + vec![rec( + "composer_lock_package", + serde_json::json!({ + "name": "monolog/monolog", + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Seldaek/monolog/zipball/abc", + "shasum": sha1, + }, + }), + )], + ); + let got = recover_lock_entry(tmp.path(), &composer).await.unwrap(); + assert_eq!(got.name, "monolog/monolog"); + assert_eq!(got.integrity, LockIntegrity::Sha1Hex(sha1)); + + // gem: checksum line + remote read from the unrewired Gemfile.lock. 
+ let sha256 = "d".repeat(64); + tokio::fs::write( + tmp.path().join("Gemfile.lock"), + "GEM\n remote: https://rubygems.org/\n specs:\n rack (3.0.0)\n", + ) + .await + .unwrap(); + let gem = entry( + "gem", + "pkg:gem/rack@3.0.0", + vec![rec( + "gemfile_lock_checksum", + serde_json::json!(format!(" rack (3.0.0) sha256={sha256}")), + )], + ); + let got = recover_lock_entry(tmp.path(), &gem).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sha256Hex(sha256.clone())); + assert_eq!( + got.resolved.as_deref(), + Some("https://rubygems.org/downloads/rack-3.0.0.gem") + ); + + // uv: the original [[package]] unit lists wheels; only the PURE one + // is recoverable. + let wheel_sha = "e".repeat(64); + let unit = format!( + "[[package]]\nname = \"six\"\nversion = \"1.16.0\"\nwheels = [\n {{ url = \"https://files.pythonhosted.org/packages/six-1.16.0-cp39-cp39-linux_x86_64.whl\", hash = \"sha256:{}\" }},\n {{ url = \"https://files.pythonhosted.org/packages/six-1.16.0-py2.py3-none-any.whl\", hash = \"sha256:{wheel_sha}\" }},\n]\n", + "f".repeat(64) + ); + let uv = entry( + "pypi", + "pkg:pypi/six@1.16.0", + vec![rec("uv_lock_package", serde_json::json!(unit))], + ); + let got = recover_lock_entry(tmp.path(), &uv).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sha256Hex(wheel_sha)); + assert!(got.resolved.unwrap().ends_with("py2.py3-none-any.whl")); + + // platform-locked wheels are explicitly unrepairable from the registry. + let mut locked = entry("pypi", "pkg:pypi/six@1.16.0", vec![]); + locked.artifact.platform_locked = Some(true); + assert!(recover_lock_entry(tmp.path(), &locked).await.is_err()); + } + + #[tokio::test] + async fn unrecoverable_fragments_fail_closed() { + let tmp = tempfile::tempdir().unwrap(); + // No wiring at all. + let bare = entry("npm", "pkg:npm/x@1.0.0", vec![]); + assert!(recover_lock_entry(tmp.path(), &bare).await.is_err()); + // golang routes through go.sum, never the ledger. 
+ let go = entry("golang", "pkg:golang/golang.org/x/text@v0.14.0", vec![]); + assert!(recover_lock_entry(tmp.path(), &go).await.is_err()); + // Poisoned integrity shapes are rejected. + let bad = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "npm_lock_entry", + serde_json::json!({"resolved": "https://x/", "integrity": "lol"}), + )], + ); + assert!(recover_lock_entry(tmp.path(), &bad).await.is_err()); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 7d60fdc..1906cfc 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -55,6 +55,7 @@ pub mod composer_lock; pub mod gem; #[cfg(feature = "golang")] pub mod golang; +pub mod lock_inventory; mod npm_common; pub mod npm_flavor; pub mod npm_lock; @@ -67,6 +68,7 @@ pub mod pypi_poetry; pub mod pypi_requirements; pub mod pypi_uv; pub mod pypi_wheel; +pub mod registry_fetch; mod toml_surgery; pub mod verify; pub mod yarn_berry_lock; @@ -74,8 +76,16 @@ pub mod yarn_classic_lock; pub use path::{ecosystem_dir_for_purl, parse_vendor_path, VendorPathParts, VENDOR_DIR}; pub use state::{load_state, save_state, VendorEntry, VendorState, VENDOR_STATE_REL}; +pub use verify::{check_vendored_artifact, file_sha256_hex, ArtifactHealth}; -use crate::patch::apply::ApplyResult; +use std::collections::HashMap; +use std::path::Path; + +use crate::manifest::schema::{PatchFileInfo, PatchRecord}; +use crate::patch::apply::{ + apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + VerifyStatus, +}; /// A non-fatal advisory surfaced as a warning event (`code` is a stable /// reason tag from the CLI contract; `detail` is human text). 
@@ -94,6 +104,276 @@ impl VendorWarning { } } +/// One warning per staged file whose pre-patch content matched NEITHER +/// `beforeHash` nor `afterHash` and was overwritten with the verified +/// patched content (vendor staging always force-applies — the stage is a +/// private copy, and every apply write path is hash-gated to exactly +/// `afterHash`). +/// +/// Detection rides the verify signature `apply_package_patch` leaves +/// behind: a force-promoted file keeps `status: Ready` WITH +/// `expected_hash: Some(..)` and a differing `current_hash`, whereas a +/// cleanly-verified file carries `expected_hash: None` (see +/// `verify_file_patch`). +pub(crate) fn mismatch_overwrite_warnings( + result: &ApplyResult, + name: &str, + version: &str, +) -> Vec { + let mut warnings: Vec = result + .files_verified + .iter() + .filter(|v| { + v.status == VerifyStatus::Ready + && v.expected_hash.is_some() + && v.current_hash != v.expected_hash + }) + .map(|v| { + VendorWarning::new( + "vendor_content_mismatch_overwritten", + format!( + "installed {name}@{version} does not match this patch's expected original \ + ({}); vendored the patched content anyway", + v.file + ), + ) + }) + .collect(); + // HashMap-driven verify order is randomized; keep warning order stable. + warnings.sort_by(|a, b| a.detail.cmp(&b.detail)); + warnings +} + +/// Patch-target files (non-empty `beforeHash`) absent from the staged +/// copy. Vendor staging force-applies (see [`force_apply_staged`]), and +/// force silently SKIPS missing files — which would pack an artifact +/// without the fix. This pre-check restores the strict apply's +/// fail-closed behavior for the non-`--force` path. Unsafe keys are +/// skipped here: the apply pipeline itself rejects them fail-closed. 
+pub(crate) async fn missing_existing_patch_files( + staged_dir: &Path, + files: &HashMap, +) -> Vec { + let mut missing: Vec = Vec::new(); + for (file_name, info) in files { + if info.before_hash.is_empty() { + continue; // a new file is expected to not exist yet + } + let normalized = normalize_file_path(file_name); + if !is_safe_relative_subpath(normalized) { + continue; + } + if tokio::fs::metadata(staged_dir.join(normalized)) + .await + .is_err() + { + missing.push(file_name.clone()); + } + } + missing.sort(); + missing +} + +/// A failed synthesized [`ApplyResult`] in the shape the strict apply +/// pipeline would have produced (success=false, `error` set, no files). +pub(crate) fn failed_apply_result(purl: &str, error: String) -> ApplyResult { + ApplyResult { + package_key: purl.to_string(), + package_path: String::new(), + success: false, + files_verified: Vec::new(), + files_patched: Vec::new(), + applied_via: HashMap::new(), + error: Some(error), + sidecar: None, + } +} + +/// Patched-content blobs harvested from the committed vendor artifacts: +/// for every manifest record whose patch uuid matches its ledger entry, +/// hash the artifact's files (git-sha256, the manifest hash) and keep the +/// ones matching the record's `afterHash`es. +/// +/// This is what lets vendor RE-RUNS (in-sync verification, re-vendor) run +/// with no network and no `.socket/blobs` — the committed artifact IS the +/// patched content. Artifact shapes: npm/pypi tarball-or-wheel files and +/// the dir-shaped ecosystems (cargo/golang/composer/gem copies). Fail-soft +/// per entry; tampered/oversized artifacts contribute nothing (the apply +/// pipeline's afterHash gate decides correctness either way). 
+pub async fn harvest_artifact_blobs( + project_root: &Path, + manifest_patches: &HashMap, +) -> HashMap> { + use crate::hash::git_sha256::compute_git_sha256_from_bytes; + + const MAX_ARTIFACT_BYTES: u64 = 256 * 1024 * 1024; + const MAX_FILE_BYTES: u64 = 64 * 1024 * 1024; + + let mut out: HashMap> = HashMap::new(); + let Ok(state) = load_state(project_root).await else { + return out; + }; + if state.entries.is_empty() { + return out; + } + + for (purl, record) in manifest_patches { + let needed: std::collections::HashSet<&str> = record + .files + .values() + .map(|f| f.after_hash.as_str()) + .filter(|h| !h.is_empty() && !out.contains_key(*h)) + .collect(); + if needed.is_empty() { + continue; + } + let Some(entry) = state.entries.get(purl).or_else(|| { + state + .entries + .values() + .find(|e| e.base_purl == crate::utils::purl::strip_purl_qualifiers(purl)) + }) else { + continue; + }; + if entry.uuid != record.uuid { + continue; // stale artifact: a re-vendor is pending, don't trust it + } + // SECURITY: the artifact path comes from the committed, tamperable + // ledger and is joined onto the project root for READING only — + // still, never follow an escaping path. + if !crate::patch::apply::is_safe_relative_subpath(&entry.artifact.path) { + continue; + } + let artifact = project_root.join(&entry.artifact.path); + + // Tarball/wheel artifacts: read entries in memory. 
+ let lower = entry.artifact.path.to_ascii_lowercase(); + if lower.ends_with(".tgz") || lower.ends_with(".tar.gz") { + if let Ok(map) = crate::patch::package::read_archive_to_map(&artifact) { + for bytes in map.into_values() { + let h = compute_git_sha256_from_bytes(&bytes); + if needed.contains(h.as_str()) { + out.insert(h, bytes); + } + } + } + continue; + } + if lower.ends_with(".whl") || lower.ends_with(".zip") { + let Ok(bytes) = tokio::fs::read(&artifact).await else { + continue; + }; + if bytes.len() as u64 > MAX_ARTIFACT_BYTES { + continue; + } + let Ok(mut archive) = zip::ZipArchive::new(std::io::Cursor::new(bytes)) else { + continue; + }; + for i in 0..archive.len() { + use std::io::Read as _; + let Ok(mut file) = archive.by_index(i) else { + continue; + }; + if file.is_dir() || file.size() > MAX_FILE_BYTES { + continue; + } + let mut content = Vec::with_capacity(file.size() as usize); + if file.read_to_end(&mut content).is_err() { + continue; + } + let h = compute_git_sha256_from_bytes(&content); + if needed.contains(h.as_str()) { + out.insert(h, content); + } + } + continue; + } + // Dir-shaped artifacts (cargo/golang/composer/gem copies): the + // record keys are package-relative, so resolve each needed file + // directly instead of walking the whole tree. 
+ if tokio::fs::metadata(&artifact) + .await + .is_ok_and(|m| m.is_dir()) + { + for (file_name, info) in &record.files { + if !needed.contains(info.after_hash.as_str()) { + continue; + } + let rel = crate::patch::apply::normalize_file_path(file_name); + if !crate::patch::apply::is_safe_relative_subpath(rel) { + continue; + } + if let Ok(content) = tokio::fs::read(artifact.join(rel)).await { + if content.len() as u64 > MAX_FILE_BYTES { + continue; + } + let h = compute_git_sha256_from_bytes(&content); + if h == info.after_hash { + out.insert(h, content); + } + } + } + } + } + out +} + +/// Run the hardened apply pipeline against a vendor stage/copy with the +/// vendor auto-force policy: +/// +/// * Missing patch-target files fail closed unless the caller's own +/// `--force` asked for that skip tolerance. +/// * The apply itself ALWAYS forces: the stage is a private copy (never +/// the user's tree), and every apply write path is hash-gated to +/// exactly `afterHash` (the archive and blob paths verify content +/// BEFORE writing; the diff path self-disables on a base mismatch) — +/// forcing can only produce the verified patched content or fail +/// closed. This is what lets vendor succeed on a package already +/// patched in place by `apply`, or on a patch whose `beforeHash` was +/// built against different bytes than the installed artifact. +/// * Every force-overwritten file (content matched NEITHER hash) emits a +/// `vendor_content_mismatch_overwritten` warning — including on dry +/// runs, so previews predict the real outcome. 
+#[allow(clippy::too_many_arguments)] +pub(crate) async fn force_apply_staged( + purl: &str, + staged_dir: &Path, + record: &PatchRecord, + sources: &PatchSources<'_>, + dry_run: bool, + force: bool, + name: &str, + version: &str, + warnings: &mut Vec, +) -> ApplyResult { + if !force { + let missing = missing_existing_patch_files(staged_dir, &record.files).await; + if let Some(first) = missing.first() { + return failed_apply_result( + purl, + format!("Cannot apply patch: {first} - File not found"), + ); + } + } + let result = apply_package_patch( + purl, + staged_dir, + &record.files, + sources, + Some(&record.uuid), + dry_run, + // The stage is private and every write path is afterHash-gated; + // Force additionally covers the caller's --force NotFound-skip + // (the missing-file pre-check above handles the default case). + crate::patch::apply::MismatchPolicy::Force, + ) + .await; + if result.success { + warnings.extend(mismatch_overwrite_warnings(&result, name, version)); + } + result +} + /// The result of one backend `vendor_*` call. 
// // `large_enum_variant`: `Done` is much bigger than `Refused` because it carries @@ -187,3 +467,212 @@ pub async fn vendored_purl_keys( Err(_) => std::collections::HashSet::new(), } } + +#[cfg(test)] +mod policy_tests { + use super::*; + use crate::patch::apply::VerifyResult; + + fn verify(status: VerifyStatus, expected: Option<&str>, current: Option<&str>) -> VerifyResult { + VerifyResult { + file: "package/index.js".to_string(), + status, + message: None, + current_hash: current.map(str::to_string), + expected_hash: expected.map(str::to_string), + target_hash: None, + } + } + + fn result_with(files_verified: Vec) -> ApplyResult { + ApplyResult { + package_key: "pkg:npm/x@1.0.0".to_string(), + package_path: String::new(), + success: true, + files_verified, + files_patched: Vec::new(), + applied_via: HashMap::new(), + error: None, + sidecar: None, + } + } + + /// Only the force-promoted signature (`Ready` + `expected_hash: Some` + + /// differing `current_hash`) flags an overwrite; clean verifies and + /// AlreadyPatched files never do. + #[test] + fn mismatch_overwrite_warnings_detects_promoted_ready() { + // Force-promoted mismatch: flagged. + let r = result_with(vec![verify(VerifyStatus::Ready, Some("aa"), Some("bb"))]); + let w = mismatch_overwrite_warnings(&r, "left-pad", "1.3.0"); + assert_eq!(w.len(), 1); + assert_eq!(w[0].code, "vendor_content_mismatch_overwritten"); + assert!(w[0].detail.contains("left-pad@1.3.0")); + assert!(w[0].detail.contains("package/index.js")); + + // Clean Ready (verify matched beforeHash): expected_hash is None. + let r = result_with(vec![verify(VerifyStatus::Ready, None, Some("aa"))]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + + // AlreadyPatched (afterHash content): not a mismatch. + let r = result_with(vec![verify( + VerifyStatus::AlreadyPatched, + None, + Some("after"), + )]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + + // NotFound (force-skipped): not an overwrite. 
+ let r = result_with(vec![verify(VerifyStatus::NotFound, None, None)]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + } +} + +#[cfg(test)] +mod harvest_tests { + use super::*; + use crate::hash::git_sha256::compute_git_sha256_from_bytes; + use crate::manifest::schema::{PatchFileInfo, PatchRecord}; + use std::collections::HashMap; + use std::io::Write as _; + + const UUID: &str = "11111111-2222-4333-8444-555555555555"; + const PATCHED: &[u8] = b"module.exports = patched;\n"; + + fn record(purl: &str, uuid: &str, file: &str, after: &[u8]) -> (String, PatchRecord) { + let mut files = HashMap::new(); + files.insert( + file.to_string(), + PatchFileInfo { + before_hash: compute_git_sha256_from_bytes(b"original"), + after_hash: compute_git_sha256_from_bytes(after), + }, + ); + ( + purl.to_string(), + PatchRecord { + uuid: uuid.to_string(), + exported_at: "2024-01-01T00:00:00Z".to_string(), + files, + vulnerabilities: HashMap::new(), + description: String::new(), + license: "MIT".to_string(), + tier: "free".to_string(), + }, + ) + } + + fn write_ledger(root: &Path, purl: &str, uuid: &str, artifact_path: &str) { + let vendor_dir = root.join(".socket/vendor"); + std::fs::create_dir_all(&vendor_dir).unwrap(); + let state = serde_json::json!({ + "version": 1, + "entries": { + purl: { + "ecosystem": "npm", + "basePurl": purl, + "uuid": uuid, + "artifact": { "path": artifact_path }, + "wiring": [], + } + } + }); + std::fs::write( + vendor_dir.join("state.json"), + serde_json::to_vec(&state).unwrap(), + ) + .unwrap(); + } + + fn write_tgz(path: &Path, entry_name: &str, content: &[u8]) { + std::fs::create_dir_all(path.parent().unwrap()).unwrap(); + let gz = flate2::write::GzEncoder::new( + std::fs::File::create(path).unwrap(), + flate2::Compression::default(), + ); + let mut tar = tar::Builder::new(gz); + let mut header = tar::Header::new_gnu(); + header.set_size(content.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + tar.append_data(&mut 
header, entry_name, content).unwrap(); + tar.into_inner().unwrap().finish().unwrap().flush().unwrap(); + } + + #[tokio::test] + async fn harvests_after_blobs_from_committed_tgz() { + let tmp = tempfile::tempdir().unwrap(); + let purl = "pkg:npm/left-pad@1.3.0"; + let rel = format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"); + write_tgz(&tmp.path().join(&rel), "package/index.js", PATCHED); + write_ledger(tmp.path(), purl, UUID, &rel); + + let (k, r) = record(purl, UUID, "package/index.js", PATCHED); + let patches = HashMap::from([(k, r)]); + let mem = harvest_artifact_blobs(tmp.path(), &patches).await; + let hash = compute_git_sha256_from_bytes(PATCHED); + assert_eq!( + mem.get(&hash).map(|b| b.as_slice()), + Some(PATCHED), + "tgz artifact must yield its afterHash blob" + ); + } + + #[tokio::test] + async fn stale_uuid_artifact_contributes_nothing() { + let tmp = tempfile::tempdir().unwrap(); + let purl = "pkg:npm/left-pad@1.3.0"; + let rel = format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"); + write_tgz(&tmp.path().join(&rel), "package/index.js", PATCHED); + // Ledger still points at an OLD patch uuid: a re-vendor is pending + // and the artifact's content must not be trusted for the new record. + write_ledger( + tmp.path(), + purl, + "99999999-aaaa-4bbb-8ccc-dddddddddddd", + &rel, + ); + + let (k, r) = record(purl, UUID, "package/index.js", PATCHED); + let patches = HashMap::from([(k, r)]); + assert!(harvest_artifact_blobs(tmp.path(), &patches) + .await + .is_empty()); + } + + #[tokio::test] + async fn escaping_artifact_path_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let purl = "pkg:npm/left-pad@1.3.0"; + // The artifact CONTENT would match — only the committed, tamperable + // ledger path escapes the project. Must contribute nothing. 
+ let project = tmp.path().join("project"); + write_tgz(&tmp.path().join("outside.tgz"), "package/index.js", PATCHED); + write_ledger(&project, purl, UUID, "../outside.tgz"); + + let (k, r) = record(purl, UUID, "package/index.js", PATCHED); + let patches = HashMap::from([(k, r)]); + assert!(harvest_artifact_blobs(&project, &patches).await.is_empty()); + } + + #[tokio::test] + async fn dir_shaped_artifact_resolves_record_relative_files() { + let tmp = tempfile::tempdir().unwrap(); + let purl = "pkg:cargo/serde@1.0.0"; + let rel = format!(".socket/vendor/cargo/{UUID}/serde-1.0.0"); + let file_dir = tmp.path().join(&rel).join("src"); + std::fs::create_dir_all(&file_dir).unwrap(); + std::fs::write(file_dir.join("lib.rs"), PATCHED).unwrap(); + write_ledger(tmp.path(), purl, UUID, &rel); + + let (k, r) = record(purl, UUID, "src/lib.rs", PATCHED); + let patches = HashMap::from([(k, r)]); + let mem = harvest_artifact_blobs(tmp.path(), &patches).await; + let hash = compute_git_sha256_from_bytes(PATCHED); + assert_eq!( + mem.get(&hash).map(|b| b.as_slice()), + Some(PATCHED), + "dir-shaped artifact must yield its afterHash blob" + ); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/npm_common.rs b/crates/socket-patch-core/src/patch/vendor/npm_common.rs index 1b26cc4..c9f5a8e 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_common.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_common.rs @@ -12,30 +12,33 @@ //! the project byte-untouched (a dry run stops after verification and //! creates nothing on disk). 
-use std::collections::HashMap; use std::path::Path; use serde_json::Value; use crate::manifest::schema::PatchRecord; -use crate::patch::apply::{apply_package_patch, normalize_file_path, ApplyResult, PatchSources}; +use crate::patch::apply::{normalize_file_path, ApplyResult, PatchSources}; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::path_safety; -use crate::utils::purl::strip_purl_qualifiers; +use crate::utils::purl::{percent_decode_purl_component, strip_purl_qualifiers}; use super::npm_pack::{pack_deterministic, PackedTarball}; use super::path::vendor_uuid_dir_rel; -use super::VendorOutcome; +use super::{VendorOutcome, VendorWarning}; /// Validated npm vendoring coordinates (the output of -/// [`guard_coordinates`]). `name`/`version` borrow from the purl. +/// [`guard_coordinates`]). `name`/`version` are the percent-DECODED purl +/// components (the API serves scoped purls as `%40scope/name`; the +/// lockfile and node_modules carry the literal `@scope/name`). #[derive(Debug)] -pub(super) struct NpmCoords<'a> { - pub name: &'a str, - pub version: &'a str, +pub(super) struct NpmCoords { + pub name: String, + pub version: String, /// `.socket/vendor/npm/` (validated, forward slashes). pub uuid_dir_rel: String, - /// Qualifier-free base PURL. + /// Qualifier-free base PURL — VERBATIM (still encoded when the API + /// encoded it): the ledger's `base_purl`/entry keys must keep + /// matching the manifest keys, which store the purl as-served. pub base_purl: String, } @@ -49,17 +52,17 @@ pub(super) struct NpmCoords<'a> { /// vendor, arbitrary delete on revert) — reject fail-closed before any disk /// access. `Err` carries a ready [`VendorOutcome::Refused`] to bubble /// verbatim. 
-pub(super) fn guard_coordinates<'a>( - purl: &'a str, +pub(super) fn guard_coordinates( + purl: &str, record: &PatchRecord, -) -> Result, Box> { +) -> Result> { let Some((name, version)) = parse_npm_purl(purl) else { return Err(Box::new(refused( "unsafe_coordinates", format!("cannot parse an npm name@version out of `{purl}`"), ))); }; - if !is_safe_npm_name(name) || !path_safety::is_safe_single_segment(version) { + if !is_safe_npm_name(&name) || !path_safety::is_safe_single_segment(&version) { return Err(Box::new(refused( "unsafe_coordinates", format!( @@ -118,6 +121,7 @@ pub(super) struct NpmStagedPack { /// verification — no pack, no dirs created). /// * `Ok((Some(staged), result))` — full success: the tarball is on disk at /// `staged.rel_tgz` and the caller proceeds to its lockfile wiring. +#[allow(clippy::too_many_arguments)] pub(super) async fn stage_patch_pack( purl: &str, installed_dir: &Path, @@ -126,6 +130,7 @@ pub(super) async fn stage_patch_pack( sources: &PatchSources<'_>, dry_run: bool, force: bool, + warnings: &mut Vec, ) -> Result<(Option, ApplyResult), Box> { let coords = guard_coordinates(purl, record)?; @@ -175,18 +180,21 @@ pub(super) async fn stage_patch_pack( } } - // Delegate to the hardened apply pipeline, pointed at the stage (which + // Delegate to the hardened apply pipeline (with the vendor auto-force + // policy — see `force_apply_staged`), pointed at the stage (which // plays the role of the installed package dir — manifest npm keys carry // the `package/` prefix and `apply` strips it via `normalize_file_path`, // exactly as it does for an in-place npm apply). 
- let result = apply_package_patch( + let result = super::force_apply_staged( purl, &stage, - &record.files, + record, sources, - Some(&record.uuid), dry_run, force, + &coords.name, + &coords.version, + warnings, ) .await; // A failed patch never packs (wiring is last — the caller returns with @@ -199,7 +207,7 @@ pub(super) async fn stage_patch_pack( let rel_tgz = format!( "{}/{}", coords.uuid_dir_rel, - tgz_rel_leaf(coords.name, coords.version) + tgz_rel_leaf(&coords.name, &coords.version) ); let dest = project_root.join(&rel_tgz); if let Some(parent) = dest.parent() { @@ -236,8 +244,8 @@ pub(super) async fn stage_patch_pack( Ok(( Some(NpmStagedPack { - name: coords.name.to_string(), - version: coords.version.to_string(), + name: coords.name, + version: coords.version, rel_tgz, packed, staged_pkg_json, @@ -251,14 +259,27 @@ pub(super) async fn stage_patch_pack( /// `pkg:npm/[@scope/]name@version` → `(name, version)`; scoped names keep /// the `@scope/` prefix. The LAST `@` separates the version (a leading /// scope-`@` is at index 0 and never the last `@` of a versioned purl). -pub(super) fn parse_npm_purl(purl: &str) -> Option<(&str, &str)> { +/// +/// Components are percent-DECODED (the API serves `pkg:npm/%40scope/...`). +/// SECURITY: each segment decodes independently AFTER the `/`/`@` splits, +/// and the post-decode `is_safe_npm_name`/`is_safe_single_segment` gates in +/// [`guard_coordinates`] reject any separator or traversal sequence a +/// decode may have surfaced (`%2e%2e`, `%2f`, ...) — decoding never runs +/// after the guards. 
+pub(super) fn parse_npm_purl(purl: &str) -> Option<(String, String)> { let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:npm/")?; let at = rest.rfind('@').filter(|&i| i > 0)?; - let (name, version) = (&rest[..at], &rest[at + 1..]); - if name.is_empty() || version.is_empty() { + let (name_raw, version_raw) = (&rest[..at], &rest[at + 1..]); + if name_raw.is_empty() || version_raw.is_empty() { return None; } + let name = name_raw + .split('/') + .map(percent_decode_purl_component) + .collect::>() + .join("/"); + let version = percent_decode_purl_component(version_raw).into_owned(); Some((name, version)) } @@ -314,16 +335,7 @@ pub(super) fn refused(code: &'static str, detail: String) -> VendorOutcome { /// results. pub(super) fn done_failure(purl: &str, error: String) -> VendorOutcome { VendorOutcome::Done { - result: ApplyResult { - package_key: purl.to_string(), - package_path: String::new(), - success: false, - files_verified: Vec::new(), - files_patched: Vec::new(), - applied_via: HashMap::new(), - error: Some(error), - sidecar: None, - }, + result: super::failed_apply_result(purl, error), entry: None, warnings: Vec::new(), } @@ -333,6 +345,7 @@ pub(super) fn done_failure(purl: &str, error: String) -> VendorOutcome { mod tests { use super::*; use crate::manifest::schema::PatchFileInfo; + use std::collections::HashMap; const UUID: &str = "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"; @@ -369,18 +382,48 @@ mod tests { fn guard_coordinates_accepts_plain_and_scoped_names() { let record = record_with_uuid(UUID); let coords = guard_coordinates("pkg:npm/left-pad@1.3.0", &record).unwrap(); - assert_eq!((coords.name, coords.version), ("left-pad", "1.3.0")); + assert_eq!( + (coords.name.as_str(), coords.version.as_str()), + ("left-pad", "1.3.0") + ); assert_eq!(coords.uuid_dir_rel, format!(".socket/vendor/npm/{UUID}")); assert_eq!(coords.base_purl, "pkg:npm/left-pad@1.3.0"); let coords = guard_coordinates("pkg:npm/@scope/pkg@1.0.0?artifact_id=x", 
&record).unwrap(); - assert_eq!((coords.name, coords.version), ("@scope/pkg", "1.0.0")); + assert_eq!( + (coords.name.as_str(), coords.version.as_str()), + ("@scope/pkg", "1.0.0") + ); assert_eq!( coords.base_purl, "pkg:npm/@scope/pkg@1.0.0", "qualifiers stripped" ); } + /// The API serves scoped purls percent-encoded; the coordinates must + /// decode to the literal `@scope/name` (which keys the lockfile and + /// the artifact path), while `base_purl` stays verbatim — the ledger + /// must keep matching the manifest key as-served. + #[test] + fn guard_coordinates_decodes_percent_encoded_scope() { + let record = record_with_uuid(UUID); + let coords = + guard_coordinates("pkg:npm/%40modelcontextprotocol/sdk@1.12.0", &record).unwrap(); + assert_eq!( + (coords.name.as_str(), coords.version.as_str()), + ("@modelcontextprotocol/sdk", "1.12.0") + ); + assert_eq!( + coords.base_purl, "pkg:npm/%40modelcontextprotocol/sdk@1.12.0", + "base_purl stays verbatim-encoded (manifest/ledger key parity)" + ); + assert_eq!( + tgz_rel_leaf(&coords.name, &coords.version), + "@modelcontextprotocol/sdk-1.12.0.tgz", + "artifact leaf is built from the decoded name" + ); + } + #[test] fn guard_coordinates_refuses_fail_closed() { let record = record_with_uuid(UUID); @@ -399,6 +442,20 @@ mod tests { guard_coordinates("pkg:npm/x@../1.0.0", &record).unwrap_err(), "unsafe_coordinates", ); + // SECURITY: percent-encoded traversal must be rejected POST-decode — + // guarding the encoded form would be a bypass (`%2e%2e` → `..`). + expect_refusal( + guard_coordinates("pkg:npm/%2e%2e/escape@1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); + expect_refusal( + guard_coordinates("pkg:npm/@scope/%2e%2e%2f%2e%2e@1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); + expect_refusal( + guard_coordinates("pkg:npm/x@%2e%2e%2f1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); // Tampered uuid. 
let record = record_with_uuid("../../x"); expect_refusal( diff --git a/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs b/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs index eb04e9c..b0efc65 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs @@ -366,6 +366,54 @@ pub async fn vendor_npm_any( outcome } +/// Is this npm-vendored entry still consumed by its lockfile's dependency +/// graph? +/// +/// `Some(true)`: the lockfile still resolves something to the entry's +/// artifact. `Some(false)`: the lockfile is present and parses but no +/// resolution references `.socket/vendor/npm//` — the dependency +/// was removed and re-locked, so the vendoring is unused (an override/ +/// resolutions DECLARATION alone does not count: pnpm's mirrored +/// `overrides:` section is excluded by the flavor probe, and the other +/// flavors carry no declaration inside the lock at all). `None`: cannot +/// determine (missing lock, unknown flavor) — callers keep the entry, +/// fail-safe. Detached entries are lockfile-invisible BY DESIGN and must +/// never be routed here (the probe would always call them unused). +pub async fn vendored_entry_in_use(entry: &VendorEntry, project_root: &Path) -> Option { + match entry.flavor.as_deref() { + Some("pnpm") => super::pnpm_lock::pnpm_entry_in_use(entry, project_root).await, + // The remaining flavors wire resolutions into the lock itself + // (resolved URLs / file: ranges / package tuples), so a textual + // probe for the uuid dir is exact: the path appears iff some + // resolution still points at the artifact. shrinkwrap wins over + // package-lock, mirroring the vendor/revert lockfile selection. 
+ None | Some("package-lock") => { + lock_text_mentions_uuid( + project_root, + &["npm-shrinkwrap.json", "package-lock.json"], + &entry.uuid, + ) + .await + } + Some("yarn-classic") | Some("yarn-berry") => { + lock_text_mentions_uuid(project_root, &["yarn.lock"], &entry.uuid).await + } + Some("bun") => lock_text_mentions_uuid(project_root, &["bun.lock"], &entry.uuid).await, + Some(_) => None, // unknown flavor: cannot determine + } +} + +/// First readable lockfile from `names`, probed for the uuid artifact dir. +async fn lock_text_mentions_uuid(project_root: &Path, names: &[&str], uuid: &str) -> Option { + let needle = format!(".socket/vendor/npm/{uuid}/"); + for name in names { + if let Ok(text) = tokio::fs::read_to_string(project_root.join(name)).await { + return Some(text.contains(&needle)); + } + } + None +} + /// Revert one recorded npm vendor entry through the flavor that wired it. /// Entries from before the flavor field existed (`None`) are package-lock /// wirings; an unknown flavor fails CLOSED (an older binary must not guess @@ -773,4 +821,85 @@ mod tests { assert!(outcome.success, "flavor {flavor:?}: {:?}", outcome.error); } } + + /// One minimal entry per flavor for the in-use probe. + fn probe_entry(flavor: Option<&str>) -> VendorEntry { + VendorEntry { + ecosystem: "npm".into(), + base_purl: "pkg:npm/left-pad@1.3.0".into(), + uuid: UUID.into(), + artifact: VendorArtifact { + path: format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring: Vec::new(), + lock: None, + took_over_go_patches: false, + detached: false, + record: None, + flavor: flavor.map(str::to_string), + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } + } + + /// The textual flavors: a resolution pointing at the uuid dir means in + /// use; a clean lock means unused; a missing lock or unknown flavor + /// cannot be determined (keep, fail-safe). 
+ #[tokio::test] + async fn vendored_entry_in_use_textual_flavors() { + let entry = probe_entry(Some("package-lock")); + + // Missing lock: undeterminable. + let tmp = tempfile::tempdir().unwrap(); + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, None); + + // Lock resolves to our artifact: in use. + touch( + tmp.path(), + "package-lock.json", + &format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + + // Dep removed + re-locked (no reference left): unused. + touch(tmp.path(), "package-lock.json", "{\"packages\":{}}").await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(false)); + + // shrinkwrap wins over package-lock (same precedence as vendoring). + touch( + tmp.path(), + "npm-shrinkwrap.json", + &format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + + // yarn flavors probe yarn.lock. + let entry = probe_entry(Some("yarn-classic")); + let tmp = tempfile::tempdir().unwrap(); + touch( + tmp.path(), + "yarn.lock", + &format!("left-pad@1.3.0:\n resolved \"file:./.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz#abc\"\n"), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + touch(tmp.path(), "yarn.lock", "# yarn lockfile v1\n").await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(false)); + + // Unknown flavor: undeterminable, fail-safe keep. 
+ let entry = probe_entry(Some("future-pm")); + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, None); + } } diff --git a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs index 2be4c7a..c10ea29 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs @@ -91,7 +91,7 @@ pub async fn vendor_npm( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel; let base_purl = coords.base_purl; @@ -175,6 +175,7 @@ pub async fn vendor_npm( sources, dry_run, force, + &mut warnings, ) .await { @@ -1090,6 +1091,170 @@ mod tests { assert!(found, "package/index.js missing from the tarball"); } + /// Read one member's bytes out of the packed tarball. + fn tgz_member(tgz: &[u8], member: &str) -> Option> { + let mut archive = tar::Archive::new(flate2::read::GzDecoder::new(tgz)); + for e in archive.entries().unwrap() { + let mut e = e.unwrap(); + if e.path().unwrap().to_string_lossy() == member { + let mut data = Vec::new(); + std::io::Read::read_to_end(&mut e, &mut data).unwrap(); + return Some(data); + } + } + None + } + + /// Vendor auto-force policy: installed content matching NEITHER hash + /// (e.g. a patch built against different bytes than the registry + /// artifact) is overwritten in the STAGE with the verified patched + /// content; the run succeeds, wires the lock, and surfaces the + /// overwrite as a `vendor_content_mismatch_overwritten` warning. The + /// installed tree is never touched. 
+ #[tokio::test] + async fn vendor_overwrites_mismatched_content_with_warning() { + let fx = fixture().await; + let divergent: &[u8] = b"module.exports = () => 'divergent';\n"; + tokio::fs::write(fx.installed().join("index.js"), divergent) + .await + .unwrap(); + + let (result, entry, warnings) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some(), "first vendor records a ledger entry"); + assert_eq!( + warnings + .iter() + .filter(|w| w.code == "vendor_content_mismatch_overwritten") + .count(), + 1, + "overwrite surfaced exactly once: {warnings:?}" + ); + assert!( + warnings[0].detail.contains("left-pad@1.3.0") + && warnings[0].detail.contains("package/index.js"), + "warning names the package and file: {warnings:?}" + ); + + // The tarball carries the VERIFIED patched bytes, not the divergent + // ones — every apply write path is hash-gated to afterHash. + let tgz = tokio::fs::read(fx.root().join(fx.expected_rel_tgz())) + .await + .unwrap(); + assert_eq!(tgz_member(&tgz, "package/index.js").unwrap(), PATCHED_INDEX); + + // The installed tree keeps its (divergent) bytes — only the stage + // was overwritten. + assert_eq!( + tokio::fs::read(fx.installed().join("index.js")) + .await + .unwrap(), + divergent + ); + + // The lock was rewired to the vendored artifact. + let lock = fx.read_lock().await; + assert_eq!( + lock["packages"]["node_modules/left-pad"]["resolved"], + json!(format!("file:{}", fx.expected_rel_tgz())) + ); + } + + /// Auto-force must NOT inherit force's silent NotFound skip: a missing + /// patch-target file still fails closed (a tarball without the fix + /// must never be packed), leaving the project byte-untouched. 
+ #[tokio::test] + async fn vendor_missing_patch_file_fails_without_force() { + let fx = fixture().await; + tokio::fs::remove_file(fx.installed().join("index.js")) + .await + .unwrap(); + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(!result.success, "missing file must fail closed"); + assert!( + result + .error + .as_deref() + .unwrap_or("") + .contains("File not found"), + "error names the missing file: {:?}", + result.error + ); + assert!(entry.is_none()); + assert_eq!( + tokio::fs::read(fx.lock_path()).await.unwrap(), + fx.lock_bytes, + "lock byte-untouched on failure" + ); + assert!( + tokio::fs::metadata(fx.root().join(".socket/vendor")) + .await + .is_err(), + "no artifact dir on failure" + ); + } + + /// `vendor --force` keeps its missing-file tolerance (strict superset + /// of the auto-force policy). + #[tokio::test] + async fn vendor_force_still_skips_missing_files() { + let fx = fixture().await; + tokio::fs::remove_file(fx.installed().join("index.js")) + .await + .unwrap(); + + let blobs = fx.root().join(".socket/blobs"); + let sources = PatchSources::blobs_only(&blobs); + let outcome = vendor_npm( + &fx.purl(), + &fx.installed(), + fx.root(), + &fx.record, + &sources, + "2026-06-09T00:00:00Z", + false, + /*force=*/ true, + ) + .await; + let (result, entry, _) = expect_done(outcome); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some()); + } + + /// A package already patched IN PLACE by `apply` vendors cleanly: the + /// staged copy verifies AlreadyPatched (no mismatch warning — the + /// content is exactly the patch's afterHash) and the tarball ships the + /// patched bytes. + #[tokio::test] + async fn vendor_of_already_applied_package_succeeds() { + let fx = fixture().await; + // Simulate a prior in-place `socket-patch apply`. 
+ tokio::fs::write(fx.installed().join("index.js"), PATCHED_INDEX) + .await + .unwrap(); + + let (result, entry, warnings) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some(), "first vendor records a ledger entry"); + assert!( + warnings + .iter() + .all(|w| w.code != "vendor_content_mismatch_overwritten"), + "afterHash content is AlreadyPatched, not a mismatch: {warnings:?}" + ); + + let tgz = tokio::fs::read(fx.root().join(fx.expected_rel_tgz())) + .await + .unwrap(); + assert_eq!(tgz_member(&tgz, "package/index.js").unwrap(), PATCHED_INDEX); + let lock = fx.read_lock().await; + assert_eq!( + lock["packages"]["node_modules/left-pad"]["resolved"], + json!(format!("file:{}", fx.expected_rel_tgz())) + ); + } + #[tokio::test] async fn rerun_is_in_sync_and_byte_stable() { let fx = fixture().await; @@ -1633,11 +1798,11 @@ mod tests { fn purl_and_name_helpers() { assert_eq!( parse_npm_purl("pkg:npm/left-pad@1.3.0"), - Some(("left-pad", "1.3.0")) + Some(("left-pad".into(), "1.3.0".into())) ); assert_eq!( parse_npm_purl("pkg:npm/@scope/pkg@1.0.0?foo=bar"), - Some(("@scope/pkg", "1.0.0")) + Some(("@scope/pkg".into(), "1.0.0".into())) ); assert_eq!(parse_npm_purl("pkg:npm/@scope/pkg"), None, "no version"); assert_eq!( diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index cac16e9..01e7151 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -93,7 +93,7 @@ pub async fn vendor_pnpm( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); // pnpm spells the override target `file:` with NO // `./` (spike P1 fixtures, verbatim). 
@@ -138,10 +138,15 @@ pub async fn vendor_pnpm( let mut lines = split_lines(&lock_text); // ── 3. Pre-flight refusals (override conflicts, entry present) ─────── - if let Err(detail) = check_pkg_override_conflict(&pkg, name, &override_key) { - return refused("vendor_override_conflict", detail); - } - if let Err(detail) = check_lock_override_conflict(&lines, name, &override_key) { + // A user-authored exact-version pin equal to `version` is TAKEN OVER + // (the pin's key is rewritten to our spec on both surfaces and the + // original value recorded for revert); anything else same-name refuses. + let disposition = match classify_pkg_override(&pkg, name, version, &override_key) { + Ok(d) => d, + Err(detail) => return refused("vendor_override_conflict", detail), + }; + let effective_key = disposition.effective_key(&override_key).to_string(); + if let Err(detail) = check_lock_override(&lines, name, version, &effective_key) { return refused("vendor_override_conflict", detail); } if !lock_has_target_package(&lines, name, version) { @@ -163,6 +168,7 @@ pub async fn vendor_pnpm( sources, dry_run, force, + &mut warnings, ) .await { @@ -200,11 +206,12 @@ pub async fn vendor_pnpm( rel_tgz: &rel_tgz, spec: &spec, integrity: &packed.integrity, + override_key: &effective_key, }; let mut wiring: Vec = Vec::new(); let (pkg_changed, created_pnpm_table, created_overrides_table) = - match apply_pkg_override(&mut pkg, &override_key, &spec, &mut wiring) { + match apply_pkg_override(&mut pkg, &effective_key, &spec, &mut wiring) { Ok(out) => out, Err(e) => return done_failure(purl, e), }; @@ -307,6 +314,46 @@ pub async fn vendor_pnpm( } } +/// Is this pnpm-vendored entry still consumed by the lock's dependency +/// graph? +/// +/// `Some(true)`: a `packages:`/`snapshots:` block resolves to the entry's +/// artifact (`@file:.socket/vendor/npm//...`) — some importer +/// still depends on the package. 
`Some(false)`: the lock parses cleanly +/// and carries NO such block — the dependency was removed and re-locked +/// (the `overrides:` declaration alone does NOT count as usage: pnpm +/// keeps it mirrored from package.json even when nothing matches it). +/// `None`: cannot determine (missing/unreadable/unsupported lock) — +/// callers must keep the entry, fail-safe. +pub async fn pnpm_entry_in_use(entry: &VendorEntry, project_root: &Path) -> Option { + let text = tokio::fs::read_to_string(project_root.join(PNPM_LOCK)) + .await + .ok()?; + if check_lock_version(&text).is_err() { + return None; + } + let lines = split_lines(&text); + for section in ["packages", "snapshots"] { + let Some((start, end)) = section_bounds(&lines, section) else { + continue; + }; + let mut i = start + 1; + while let Some(block) = next_block(&lines, i, end) { + let resolved_to_ours = block + .key + .find("@file:") + .map(|at| &block.key[at + 1..]) + .and_then(parse_vendor_path) + .is_some_and(|p| p.eco == "npm" && p.uuid == entry.uuid); + if resolved_to_ours { + return Some(true); + } + i = block.end; + } + } + Some(false) +} + /// Undo one pnpm-vendored package: restore the recorded pair fragments and /// remove the artifact dir. Reverse application order; per-record ownership /// is re-checked against the live fragment (drift ⇒ warning, left alone). @@ -485,6 +532,11 @@ struct EditCtx<'a> { spec: &'a str, /// `sha512-` of the packed tarball. integrity: &'a str, + /// The override key BOTH surfaces edit (see + /// [`OverrideDisposition::effective_key`]): our canonical + /// `name@version` on a fresh insert, or the user's existing key on a + /// takeover / re-run over a taken-over key. + override_key: &'a str, } impl EditCtx<'_> { @@ -498,10 +550,15 @@ impl EditCtx<'_> { format!("{}@{}", self.name, self.spec) } - /// Does `value` point into `.socket/vendor/npm/` (ours — any uuid; a - /// stale uuid is rewritten to the current one with `original: None`)? 
+ /// Does `value` point at OUR vendored tarball for THIS name@version + /// (any uuid — a stale uuid is rewritten to the current one with + /// `original: None`)? The leaf binding is load-bearing: a project can + /// vendor the SAME package at several versions, and a name-only match + /// would let one version's edit clobber another's entries. fn is_ours(&self, value: &str) -> bool { - parse_vendor_path(value).is_some_and(|p| p.eco == "npm") + parse_vendor_path(value).is_some_and(|p| { + p.eco == "npm" && p.leaf == super::npm_common::tgz_rel_leaf(self.name, self.version) + }) } /// The per-importer `specifier:` spelling: re-relativized for nested @@ -560,46 +617,147 @@ fn override_key_name(key: &str) -> &str { } } -/// Is this (key, value) override pair OURS for the target package — the -/// exact versioned selector pointing into `.socket/vendor/npm/`? -fn override_is_ours(key: &str, value: &str, our_key: &str) -> bool { - key == our_key && parse_vendor_path(value).is_some_and(|p| p.eco == "npm") +/// Does `value` point into `.socket/vendor/npm/` (ours — any uuid)? +fn is_vendor_value(value: &str) -> bool { + parse_vendor_path(value).is_some_and(|p| p.eco == "npm") } -/// A user-authored override already steering this package would be -/// silently fought over by ours; refuse instead (fail-closed). -fn check_pkg_override_conflict(pkg: &Value, name: &str, our_key: &str) -> Result<(), String> { +/// A vendor value belonging to THIS `name@version`'s tarball (any uuid). +/// The leaf binding matters: a project can vendor the same package at +/// several versions, and edits must never treat a SIBLING version's +/// override/entry as their own. +fn vendor_value_is_for(value: &str, name: &str, version: &str) -> bool { + parse_vendor_path(value) + .is_some_and(|p| p.eco == "npm" && p.leaf == super::npm_common::tgz_rel_leaf(name, version)) +} + +/// How the package.json `pnpm.overrides` table relates to the package +/// being vendored. 
The lock's `overrides:` section must mirror this map +/// key-for-key (pnpm hard-checks the two and fails +/// `ERR_PNPM_LOCKFILE_CONFIG_MISMATCH` on any drift), so whichever key +/// this classification yields is the one BOTH surfaces edit. +#[derive(Debug, Clone, PartialEq, Eq)] +enum OverrideDisposition { + /// No same-name key: insert our canonical `name@version` key. + Insert, + /// A same-name key already points into `.socket/vendor/npm/` — ours + /// (any uuid; possibly a user key an earlier vendor took over). + /// Rewrite that key's value in place; our own value is never + /// recorded as an `original`. + Ours { key: String }, + /// A user-authored exact-version pin equal to the version being + /// vendored (`"tar-fs": "3.1.0"` or `"tar-fs@3.1.0": "3.1.0"`): take + /// the key over — rewrite its VALUE to the `file:` spec (the user's + /// pin already forces every `tar-fs` to this exact version, so + /// redirecting the same key preserves their semantics) and record + /// the pin as the wiring `original` so revert restores it exactly. + Takeover { key: String, original: String }, +} + +impl OverrideDisposition { + /// The override key both surfaces edit: the matched existing key, or + /// our canonical `name@version` on a fresh insert. + fn effective_key<'a>(&'a self, our_key: &'a str) -> &'a str { + match self { + OverrideDisposition::Insert => our_key, + OverrideDisposition::Ours { key } | OverrideDisposition::Takeover { key, .. } => key, + } + } +} + +/// Classify the package.json override state for `name` (see +/// [`OverrideDisposition`]). `Err` is a genuine conflict (fail-closed): +/// a range/different-version value, a `parent>child` selector chain +/// (scoped to one dependent — our whole-graph rewrite has different +/// semantics), a non-string value, or several same-name keys. 
+fn classify_pkg_override( + pkg: &Value, + name: &str, + version: &str, + our_key: &str, +) -> Result { let Some(overrides) = pkg.get("pnpm").and_then(|p| p.get("overrides")) else { - return Ok(()); + return Ok(OverrideDisposition::Insert); }; let Some(map) = overrides.as_object() else { return Err("package.json pnpm.overrides is not an object".to_string()); }; + let mut found: Option = None; for (key, value) in map { if override_key_name(key) != name { continue; } let value_str = value.as_str().unwrap_or(""); - if override_is_ours(key, value_str, our_key) { - continue; // ours (possibly a stale uuid) — the edit handles it + // A SIBLING version's vendored override coexists — not ours to + // touch (and not a conflict): skip it entirely. + if is_vendor_value(value_str) && !vendor_value_is_for(value_str, name, version) { + continue; + } + if found.is_some() { + return Err(format!( + "package.json carries more than one pnpm override for `{name}`; vendoring \ + cannot pick one — remove the extras first" + )); + } + let classified = if key.contains('>') { + None + } else if is_vendor_value(value_str) { + Some(OverrideDisposition::Ours { key: key.clone() }) + } else if value_str == version && (key == name || key == our_key) { + Some(OverrideDisposition::Takeover { + key: key.clone(), + original: value_str.to_string(), + }) + } else { + None + }; + match classified { + Some(d) => found = Some(d), + None => { + return Err(format!( + "package.json already carries a pnpm override for `{key}` ({value}); \ + vendoring would fight it — remove the override (or vendor --revert) \ + first (an exact-version pin equal to {version} is taken over \ + automatically)" + )) + } } - return Err(format!( - "package.json already carries a pnpm override for `{key}` ({value}); vendoring \ - would fight it — remove the override (or vendor --revert) first" - )); } - Ok(()) + Ok(found.unwrap_or(OverrideDisposition::Insert)) } -/// Same conflict check against the lock's own `overrides:` section 
(a -/// desynced lock-side override would be silently clobbered otherwise). -fn check_lock_override_conflict(lines: &[String], name: &str, our_key: &str) -> Result<(), String> { +/// Lock-side mirror check against the effective key. Every same-name key +/// in the lock's `overrides:` section must BE `effective_key` (pnpm +/// requires the lock's override map to equal package.json's — a key-shape +/// drift means the pair is already desynced) with a value the edit can +/// own: ours, the exact pinned `version` (takeover), or already our spec. +/// A missing section/key is fine — the edit inserts it, restoring parity. +fn check_lock_override( + lines: &[String], + name: &str, + version: &str, + effective_key: &str, +) -> Result<(), String> { let Some((start, end)) = section_bounds(lines, "overrides") else { return Ok(()); }; for line in &lines[start + 1..end] { if let Some((key, _repr, rest)) = parse_key_line(line, 2) { - if override_key_name(&key) == name && !override_is_ours(&key, &rest, our_key) { + if override_key_name(&key) != name { + continue; + } + // A sibling version's vendored override coexists — skip it. + if is_vendor_value(&rest) && !vendor_value_is_for(&rest, name, version) { + continue; + } + if key != effective_key { + return Err(format!( + "{PNPM_LOCK} carries an override key `{key}` for `{name}` that does not \ + match package.json's `{effective_key}` — the two override maps must \ + agree (run `pnpm install` to re-sync them) before vendoring" + )); + } + if !(is_vendor_value(&rest) || rest == version) { return Err(format!( "{PNPM_LOCK} already carries an override for `{key}` ({rest}); vendoring \ would fight it — remove the override (or vendor --revert) first" @@ -665,20 +823,24 @@ fn apply_pkg_override( if existing == Some(spec) { return Ok((false, false, false)); // in sync, no record } - // The conflict pre-flight guarantees any existing value here is OURS - // (a stale uuid): never record our own edit as the "original". 
- let was_ours = existing.is_some(); + // The classify pre-flight guarantees an existing value here is either + // OURS (a stale uuid — never recorded as an "original") or the user's + // exact-version pin being TAKEN OVER (recorded so revert restores it). + let was_present = existing.is_some(); + let original = existing + .filter(|v| !is_vendor_value(v)) + .map(|v| Value::String(v.to_string())); overrides.insert(our_key.to_string(), Value::String(spec.to_string())); wiring.push(WiringRecord { file: PACKAGE_JSON.to_string(), kind: KIND_PKG_OVERRIDE.to_string(), - action: if was_ours { + action: if was_present { WiringAction::Rewritten } else { WiringAction::Added }, key: Some(our_key.to_string()), - original: None, // Added has none; Rewritten-over-ours records none by design + original, new: Some(Value::String(spec.to_string())), }); Ok((true, created_pnpm_table, created_overrides_table)) @@ -694,7 +856,7 @@ fn edit_overrides( ctx: &EditCtx<'_>, wiring: &mut Vec, ) -> Result { - let our_key = ctx.reg_key(); + let our_key = ctx.override_key.to_string(); let entry_line = format!(" {}: {}", yaml_key(&our_key), ctx.spec); if let Some((start, end)) = section_bounds(lines, "overrides") { // Immutable scan first: our line's position (if present) + the last @@ -702,29 +864,38 @@ fn edit_overrides( let mut ours = None; let mut last_entry = start; for (i, line) in lines.iter().enumerate().take(end).skip(start + 1) { - if let Some((key, _repr, rest)) = parse_key_line(line, 2) { + if let Some((key, repr, rest)) = parse_key_line(line, 2) { last_entry = i; if key == our_key { - ours = Some((i, rest)); + ours = Some((i, repr, rest)); break; } } } - if let Some((i, rest)) = ours { + if let Some((i, repr, rest)) = ours { if rest == ctx.spec { return Ok(false); // in sync } - // Ours with a stale uuid (conflict pre-flight proved it). 
- lines[i] = entry_line; + // Ours with a stale uuid (no original), or the user's pinned + // value being TAKEN OVER (recorded as original; the live key + // repr/quoting is preserved so revert is byte-faithful). + let original = (!is_vendor_value(&rest)).then(|| rest.clone()); + lines[i] = format!(" {}: {}", yaml_key_like(&our_key, &repr), ctx.spec); wiring.push(overrides_record( &our_key, ctx.spec, WiringAction::Rewritten, + original, )); return Ok(true); } lines.insert(last_entry + 1, entry_line); - wiring.push(overrides_record(&our_key, ctx.spec, WiringAction::Added)); + wiring.push(overrides_record( + &our_key, + ctx.spec, + WiringAction::Added, + None, + )); return Ok(true); } // No overrides section: insert one right before `importers:` (with the @@ -735,17 +906,29 @@ fn edit_overrides( importers..importers, ["overrides:".to_string(), entry_line, String::new()], ); - wiring.push(overrides_record(&our_key, ctx.spec, WiringAction::Added)); + wiring.push(overrides_record( + &our_key, + ctx.spec, + WiringAction::Added, + None, + )); Ok(true) } -fn overrides_record(key: &str, spec: &str, action: WiringAction) -> WiringRecord { +fn overrides_record( + key: &str, + spec: &str, + action: WiringAction, + original: Option, +) -> WiringRecord { WiringRecord { file: PNPM_LOCK.to_string(), kind: KIND_LOCK_OVERRIDES.to_string(), action, key: Some(key.to_string()), - original: None, // Added, or rewritten-over-ours (never an original) + // `Some` only on a takeover (the user's pinned value); Added and + // rewritten-over-ours never record an original. 
+ original: original.map(Value::String), new: Some(Value::String(spec.to_string())), } } @@ -844,13 +1027,41 @@ fn edit_packages( let new_key = ctx.new_key(); let ours_prefix = format!("{}@file:", ctx.name); + // Fail closed on a half-drifted lock: when BOTH the registry-keyed + // entry and a socket file:-keyed entry for this package exist, a rekey + // would splice a DUPLICATE mapping key (pnpm refuses to parse those) + // and surgery cannot decide which block carries the truth. + { + let mut has_registry = false; + let mut has_ours = false; + let mut j = start + 1; + while let Some(block) = next_block(lines, j, end) { + if block.key == reg_key { + has_registry = true; + } else if block + .key + .strip_prefix(&ours_prefix) + .is_some_and(|rest| ctx.is_ours(rest)) + { + has_ours = true; + } + j = block.end; + } + if has_registry && has_ours { + return Err(format!( + "packages section carries BOTH `{reg_key}` and a `{ours_prefix}…` entry (a \ + half-edited lock); run `pnpm install` to re-resolve it, then re-vendor" + )); + } + } + let mut i = start + 1; while let Some(block) = next_block(lines, i, end) { let is_registry = block.key == reg_key; let is_ours_key = block .key .strip_prefix(&ours_prefix) - .is_some_and(|rest| parse_vendor_path(rest).is_some_and(|p| p.eco == "npm")); + .is_some_and(|rest| ctx.is_ours(rest)); if !is_registry && !is_ours_key { i = block.end; continue; @@ -926,13 +1137,37 @@ fn edit_snapshot_rekey( let reg_key = ctx.reg_key(); let new_key = ctx.new_key(); let ours_prefix = format!("{}@file:", ctx.name); + // Same duplicate-key fail-closed guard as edit_packages. 
+ { + let mut has_registry = false; + let mut has_ours = false; + let mut j = start + 1; + while let Some(block) = next_block(lines, j, end) { + if block.key == reg_key { + has_registry = true; + } else if block + .key + .strip_prefix(&ours_prefix) + .is_some_and(|rest| ctx.is_ours(rest)) + { + has_ours = true; + } + j = block.end; + } + if has_registry && has_ours { + return Err(format!( + "snapshots section carries BOTH `{reg_key}` and a `{ours_prefix}…` entry (a \ + half-edited lock); run `pnpm install` to re-resolve it, then re-vendor" + )); + } + } let mut i = start + 1; while let Some(block) = next_block(lines, i, end) { let is_registry = block.key == reg_key; let is_ours_key = block .key .strip_prefix(&ours_prefix) - .is_some_and(|rest| parse_vendor_path(rest).is_some_and(|p| p.eco == "npm")); + .is_some_and(|rest| ctx.is_ours(rest)); if !is_registry && !is_ours_key { i = block.end; continue; @@ -1062,7 +1297,17 @@ fn revert_pkg_record( ))); return; } - overrides.shift_remove(key); + // A takeover recorded the user's pinned value as `original`: restore + // it in place (the key stays). A plain Added/Rewritten-over-ours + // record has no original — remove the key as before. 
+ match rec.original.as_ref().and_then(Value::as_str) { + Some(orig) => { + overrides.insert(key.to_string(), Value::String(orig.to_string())); + } + None => { + overrides.shift_remove(key); + } + } *dirty = true; } @@ -1112,15 +1357,15 @@ fn revert_overrides_line( let mut ours_at = None; let mut others = 0usize; for (i, line) in lines.iter().enumerate().take(end).skip(start + 1) { - if let Some((k, _repr, rest)) = parse_key_line(line, 2) { + if let Some((k, repr, rest)) = parse_key_line(line, 2) { if k == key && ours_at.is_none() { - ours_at = Some((i, rest)); + ours_at = Some((i, repr, rest)); } else { others += 1; } } } - let Some((idx, rest)) = ours_at else { + let Some((idx, repr, rest)) = ours_at else { warnings.push(drifted(format!("overrides entry `{key}` no longer exists"))); return; }; @@ -1132,6 +1377,13 @@ fn revert_overrides_line( ))); return; } + // A takeover recorded the user's pinned value: restore it in place + // (key + quoting preserved; the section obviously stays). + if let Some(orig) = rec.original.as_ref().and_then(Value::as_str) { + lines[idx] = format!(" {}: {orig}", yaml_key_like(key, &repr)); + *dirty = true; + return; + } lines.remove(idx); *dirty = true; if others == 0 { @@ -1404,7 +1656,7 @@ async fn commit_pair( // pnpm-lock.yaml is machine-emitted with a fixed 2/4/6/8-space shape; these // helpers splice line blocks and never interpret YAML generically. -fn split_lines(text: &str) -> Vec { +pub(super) fn split_lines(text: &str) -> Vec { text.split('\n').map(str::to_string).collect() } @@ -1415,7 +1667,7 @@ fn join_lines(lines: &[String]) -> String { /// `(header_idx, end_idx)` of a top-level `name:` section; `end` is the /// first following column-0 line (exclusive), so trailing blank separator /// lines belong to the section. 
-fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { +pub(super) fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { let header = format!("{name}:"); let start = lines.iter().position(|l| l == &header)?; let end = lines @@ -1431,10 +1683,10 @@ fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { /// One 2-space-keyed block inside a section (`[header, end)`; `end` stops at /// the blank separator / next block header, so the captured fragment is the /// verbatim entry without surrounding blanks). -struct YamlBlock { - header: usize, - end: usize, - key: String, +pub(super) struct YamlBlock { + pub(super) header: usize, + pub(super) end: usize, + pub(super) key: String, /// The key exactly as spelled in the file (incl. quotes) — rekeys /// preserve the file's quoting style. repr: String, @@ -1454,7 +1706,7 @@ impl YamlBlock { } /// The next block at or after line `i` (within `[i, end)`). -fn next_block(lines: &[String], mut i: usize, end: usize) -> Option { +pub(super) fn next_block(lines: &[String], mut i: usize, end: usize) -> Option { while i < end { if let Some((key, repr, rest)) = parse_key_line(&lines[i], 2) { let mut j = i + 1; @@ -2111,6 +2363,230 @@ snapshots: assert!(live_lock.contains("overrides:\n other-pkg: 2.0.0\n\nimporters:")); } + // ── in-use probe ─────────────────────────────────────────────────────── + + /// The prune-time in-use probe: a packages/snapshots block resolving to + /// the artifact means in use; an overrides declaration ALONE (the state + /// pnpm leaves after the dependency is removed and re-locked) does not; + /// a missing or unsupported-version lock is undeterminable (keep). + #[tokio::test] + async fn pnpm_entry_in_use_reflects_lock_graph() { + let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; + let (_, entry, _) = expect_done(fx.vendor(false).await); + let entry = entry.unwrap(); + + // Freshly vendored: the rekeyed file: blocks are in the graph. 
+ assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, Some(true)); + + // Dep removed + re-locked: pnpm prunes the file: blocks but keeps + // the overrides declaration mirrored from package.json. + let removed_lock = format!( + "lockfileVersion: '9.0'\n\nsettings:\n autoInstallPeers: true\n\ + \noverrides:\n left-pad@1.3.0: file:{}\n\nimporters:\n\n .:\n \ + dependencies:\n consumer:\n specifier: file:./consumer\n \ + version: file:consumer\n\npackages:\n\n consumer@file:consumer:\n \ + resolution: {{directory: consumer, type: directory}}\n\nsnapshots:\n\n \ + consumer@file:consumer: {{}}\n", + fx.rel_tgz() + ); + tokio::fs::write(fx.root().join(PNPM_LOCK), &removed_lock) + .await + .unwrap(); + assert_eq!( + pnpm_entry_in_use(&entry, fx.root()).await, + Some(false), + "the lingering overrides declaration alone is not usage" + ); + + // Unsupported lock version: undeterminable. + tokio::fs::write(fx.root().join(PNPM_LOCK), "lockfileVersion: '6.0'\n") + .await + .unwrap(); + assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, None); + + // Missing lock: undeterminable. + tokio::fs::remove_file(fx.root().join(PNPM_LOCK)) + .await + .unwrap(); + assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, None); + } + + // ── exact-version pin takeover ───────────────────────────────────────── + + /// package.json with a user-authored override pin (`key: value`) plus the + /// matching lock-side `overrides:` mirror line. 
+ fn pin_fixture_inputs(key: &str, value: &str) -> (String, String) { + let pkg = format!( + "{{\n \"name\": \"vendor-spike\",\n \"version\": \"1.0.0\",\n \"private\": true,\n \"dependencies\": {{\n \"consumer\": \"file:./consumer\",\n \"left-pad\": \"1.3.0\",\n \"left-pad-old\": \"npm:left-pad@1.2.0\"\n }},\n \"pnpm\": {{\n \"overrides\": {{\n \"{key}\": \"{value}\"\n }}\n }}\n}}\n" + ); + let lock = P1_BEFORE_LOCK.replace( + "importers:", + &format!("overrides:\n {key}: {value}\n\nimporters:"), + ); + (pkg, lock) + } + + /// A user-authored EXACT-version pin equal to the patched version is + /// taken over: the user's key keeps its spelling on both surfaces, its + /// value moves to our `file:` spec, the wiring records the pin as + /// `original`, and a full revert restores both files byte-identically. + #[tokio::test] + async fn user_exact_pin_bare_key_is_taken_over_and_revert_restores_it() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + // package.json: the USER'S key (`left-pad`) now carries our spec; + // no `left-pad@1.3.0` key was added; tables pre-existed. + let pkg: Value = serde_json::from_str(&fx.read(PACKAGE_JSON).await).unwrap(); + let overrides = &pkg["pnpm"]["overrides"]; + assert_eq!( + overrides["left-pad"], + Value::String(format!("file:{}", fx.rel_tgz())) + ); + assert!(overrides.get("left-pad@1.3.0").is_none()); + assert_eq!( + entry.pnpm, + Some(PnpmMeta { + created_overrides_table: false, + created_pnpm_table: false + }) + ); + + // Lock: same key, same value (map parity — pnpm hard-checks it). 
+ let live_lock = fx.read(PNPM_LOCK).await; + assert!( + live_lock.contains(&format!("overrides:\n left-pad: file:{}", fx.rel_tgz())), + "{live_lock}" + ); + + // Wiring: both override records carry the user's key, action + // Rewritten, and the pin as `original`. + for kind in [KIND_PKG_OVERRIDE, KIND_LOCK_OVERRIDES] { + let rec = entry + .wiring + .iter() + .find(|r| r.kind == kind) + .unwrap_or_else(|| panic!("no {kind} record: {:?}", entry.wiring)); + assert_eq!(rec.key.as_deref(), Some("left-pad"), "{kind}"); + assert_eq!(rec.action, WiringAction::Rewritten, "{kind}"); + assert_eq!( + rec.original, + Some(Value::String("1.3.0".to_string())), + "{kind}: the user's pin is the original" + ); + } + + // Full revert restores the pin on both surfaces byte-identically. + let outcome = revert_pnpm(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before); + assert_eq!(fx.read(PNPM_LOCK).await, lock_before); + } + + /// The versioned key shape (`left-pad@1.3.0: 1.3.0`) is taken over the + /// same way — the key happens to equal our canonical key. + #[tokio::test] + async fn user_exact_pin_versioned_key_is_taken_over() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad@1.3.0", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + let pkg: Value = serde_json::from_str(&fx.read(PACKAGE_JSON).await).unwrap(); + assert_eq!( + pkg["pnpm"]["overrides"]["left-pad@1.3.0"], + Value::String(format!("file:{}", fx.rel_tgz())) + ); + let rec = entry + .wiring + .iter() + .find(|r| r.kind == KIND_PKG_OVERRIDE) + .unwrap(); + assert_eq!(rec.original, Some(Value::String("1.3.0".to_string()))); + + // Revert restores the pin. 
+ let outcome = revert_pnpm(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before); + assert_eq!(fx.read(PNPM_LOCK).await, lock_before); + } + + /// A second vendor over a taken-over key is the in-sync hot path: + /// AlreadyPatched, no new ledger entry, bytes stable. (Guards the + /// `Ours` classification accepting the user-keyed vendor value — the + /// old `key == our_key` requirement would refuse its own wiring.) + #[tokio::test] + async fn takeover_rerun_is_in_sync_and_records_nothing() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some()); + let pkg_after = fx.read(PACKAGE_JSON).await; + let lock_after = fx.read(PNPM_LOCK).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_none(), "in-sync rerun records nothing"); + assert!(result + .files_verified + .iter() + .all(|v| v.status == crate::patch::apply::VerifyStatus::AlreadyPatched)); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_after, "bytes stable"); + assert_eq!(fx.read(PNPM_LOCK).await, lock_after, "bytes stable"); + } + + /// Selector chains and duplicate same-name keys still refuse — only a + /// plain exact pin is taken over. (Range keys and different-version + /// values are covered by `existing_user_override_for_the_name_is_refused`.) + #[tokio::test] + async fn chain_and_duplicate_override_keys_still_refuse() { + // `parent>child` chain, even with the exact version value. 
+ let (pkg, lock) = pin_fixture_inputs("consumer>left-pad", "1.3.0"); + let fx = fixture_with(&pkg, &lock).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("consumer>left-pad"), "{detail}"); + + // Two same-name keys (one ours-shaped pin + one bare pin). + let pkg = "{\n \"name\": \"x\",\n \"pnpm\": {\n \"overrides\": {\n \"left-pad\": \"1.3.0\",\n \"left-pad@1.3.0\": \"1.3.0\"\n }\n }\n}\n".to_string(); + let fx = fixture_with(&pkg, P1_BEFORE_LOCK).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("more than one"), "{detail}"); + } + + /// pkg↔lock override-key shape drift refuses (pnpm itself would fail + /// `ERR_PNPM_LOCKFILE_CONFIG_MISMATCH`); a pkg-side pin with NO lock + /// mirror is fine — the edit inserts the same key, restoring parity. + #[tokio::test] + async fn takeover_lock_shape_mismatch_refuses_but_missing_section_inserts() { + // Shape drift: pkg keys `left-pad`, lock keys `left-pad@1.3.0`. + let (pkg, _) = pin_fixture_inputs("left-pad", "1.3.0"); + let lock = P1_BEFORE_LOCK.replace( + "importers:", + "overrides:\n left-pad@1.3.0: 1.3.0\n\nimporters:", + ); + let fx = fixture_with(&pkg, &lock).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("must"), "{detail}"); + + // No lock overrides section at all: takeover inserts the pkg key. 
+ let fx = fixture_with(&pkg, P1_BEFORE_LOCK).await; + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let live_lock = fx.read(PNPM_LOCK).await; + assert!( + live_lock.contains(&format!("overrides:\n left-pad: file:{}", fx.rel_tgz())), + "lock key matches the pkg key: {live_lock}" + ); + assert!(entry.is_some()); + } + #[tokio::test] async fn created_tables_bookkeeping_and_revert_prunes_them() { // pnpm table exists (other keys), overrides created by us: revert @@ -2212,6 +2688,162 @@ snapshots: ); } + /// A half-edited lock carrying BOTH the registry-keyed packages entry + /// AND a socket file:-keyed one: a rekey would splice a DUPLICATE + /// mapping key (pnpm refuses to parse those) — fail closed, nothing + /// written. + #[tokio::test] + async fn half_drifted_duplicate_keys_fail_closed() { + let dup_lock = P1_BEFORE_LOCK.replace( + " left-pad@1.3.0:\n resolution: {integrity: sha512-XI5MPzVNApjAyhQzphX8BkmKsKUxD4LdyK24iZeQGinBN9yTQT3bFlCBy/aVx2HrNcqQGsdot8ghrjyrvMCoEA==}\n deprecated: use String.prototype.padStart()", + &format!( + " left-pad@1.3.0:\n resolution: {{integrity: sha512-XI5MPzVNApjAyhQzphX8BkmKsKUxD4LdyK24iZeQGinBN9yTQT3bFlCBy/aVx2HrNcqQGsdot8ghrjyrvMCoEA==}}\n deprecated: use String.prototype.padStart()\n\n left-pad@file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz:\n resolution: {{integrity: sha512-stale==, tarball: file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz}}\n version: 1.3.0" + ), + ); + assert_ne!(dup_lock, P1_BEFORE_LOCK, "fixture edit must apply"); + let fx = fixture_with(P1_BEFORE_PKG, &dup_lock).await; + let lock_before = fx.read(PNPM_LOCK).await; + let pkg_before = fx.read(PACKAGE_JSON).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(!result.success, "half-drifted lock must fail closed"); + assert!( + result + .error + .as_deref() + .is_some_and(|e| e.contains("half-edited lock")), + "{:?}", + result.error + ); + 
assert!(entry.is_none()); + assert_eq!(fx.read(PNPM_LOCK).await, lock_before, "lock untouched"); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before, "pkg untouched"); + } + + /// Two VERSIONS of the same package vendored in sequence: each edit + /// must bind to its own version's entries — a name-only "ours" match + /// would let the second vendor clobber/rekey the first one's blocks + /// (live-debugged on Flowise: identical duplicated mapping keys). + #[tokio::test] + async fn multi_version_vendor_does_not_clobber_sibling_entries() { + let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; + let (r1, e1, _) = expect_done(fx.vendor(false).await); + assert!(r1.success); + assert!(e1.is_some()); + let tgz_13 = fx.rel_tgz(); + + // Vendor left-pad@1.2.0 under a DIFFERENT uuid (the `left-pad-old` + // npm: alias resolves it in the same lock). + let uuid2 = "22222222-3333-4444-8555-666666666666"; + let installed2 = fx.root().join("node_modules/left-pad-old"); + tokio::fs::create_dir_all(&installed2).await.unwrap(); + tokio::fs::write( + installed2.join("package.json"), + br#"{"name":"left-pad","version":"1.2.0"}"#, + ) + .await + .unwrap(); + tokio::fs::write(installed2.join("index.js"), ORIG_INDEX) + .await + .unwrap(); + let mut record2 = fx.record.clone(); + record2.uuid = uuid2.to_string(); + let blobs = fx.root().join(".socket/blobs"); + let sources = PatchSources::blobs_only(&blobs); + let outcome = vendor_pnpm( + "pkg:npm/left-pad@1.2.0", + &installed2, + fx.root(), + &record2, + &sources, + "2026-06-09T00:00:00Z", + false, + false, + ) + .await; + let (r2, e2, _) = expect_done(outcome); + assert!(r2.success, "{:?}", r2.error); + assert!(e2.is_some()); + + let lock = fx.read(PNPM_LOCK).await; + let key13 = format!(" left-pad@file:{tgz_13}:"); + let key12 = format!(" left-pad@file:.socket/vendor/npm/{uuid2}/left-pad-1.2.0.tgz:"); + // Both versions' packages + snapshots blocks exist exactly once + // each (snapshot entries may be inline `key: {}`). 
+ for (key, label) in [(&key13, "1.3.0"), (&key12, "1.2.0")] { + assert_eq!( + lock.lines().filter(|l| l.starts_with(key.as_str())).count(), + 2, // packages + snapshots + "{label} entries intact:\n{lock}" + ); + } + // No duplicated mapping keys within a section (what pnpm + // hard-rejects): each section's 2-space keys are unique. + for section in ["overrides", "packages", "snapshots"] { + let Some((start, end)) = section_bounds(&split_lines(&lock), section) else { + continue; + }; + let lines = split_lines(&lock); + let mut keys: Vec = lines[start + 1..end] + .iter() + .filter_map(|l| parse_key_line(l, 2).map(|(k, _, _)| k)) + .collect(); + let total = keys.len(); + keys.sort_unstable(); + keys.dedup(); + assert_eq!(total, keys.len(), "duplicated keys in {section}:\n{lock}"); + } + } + + /// Re-vendor over a wired lock whose recorded integrity DRIFTED (e.g. + /// the artifact was rebuilt from a differently-shaped source): the + /// stale-ours refresh must REPLACE the file:-keyed blocks, never + /// duplicate them. + #[tokio::test] + async fn integrity_drift_refresh_never_duplicates_keys() { + let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; + let (_, entry, _) = expect_done(fx.vendor(false).await); + assert!(entry.is_some()); + + // Simulate drift: the lock records a DIFFERENT integrity for OUR + // file: entry (only) than the tarball the next run will pack. 
+ let lock = fx.read(PNPM_LOCK).await; + let drifted = lock + .lines() + .map(|l| { + if l.contains("tarball: file:.socket") { + l.replace("integrity: sha512-", "integrity: sha512-DRIFT") + } else { + l.to_string() + } + }) + .collect::>() + .join("\n"); + assert_ne!(drifted, lock); + tokio::fs::write(fx.root().join(PNPM_LOCK), &drifted) + .await + .unwrap(); + + let (result, _, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let healed = fx.read(PNPM_LOCK).await; + let ours_key = format!(" left-pad@file:{}:", fx.rel_tgz()); + let count = healed.lines().filter(|l| *l == ours_key.as_str()).count(); + assert_eq!( + count, 1, + "exactly one file:-keyed packages/snapshots block per section; lock: +{healed}" + ); + let snap_count = healed + .matches(&format!("left-pad@file:{}", fx.rel_tgz())) + .count(); + assert!( + !healed.contains("sha512-DRIFT"), + "drifted integrity healed: {snap_count} refs +{healed}" + ); + } + #[tokio::test] async fn dry_run_writes_nothing() { let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; diff --git a/crates/socket-patch-core/src/patch/vendor/pypi.rs b/crates/socket-patch-core/src/patch/vendor/pypi.rs index 553a0dc..b9b31f3 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi.rs @@ -20,7 +20,7 @@ use super::pypi_poetry::{PoetryProject, PoetryTarget}; use super::pypi_requirements::{preflight_requirements, revert_requirements, wire_requirements}; use super::pypi_uv::{ check_target_guards, classify_dependency, load_uv_project, revert_uv, wire_uv, UvDepClass, - UvProject, + UvProject, UvTarget, }; use super::pypi_wheel::{build_patched_wheel, locate_installed_dist, wheel_file_name}; use super::state::{ @@ -221,6 +221,9 @@ enum WiringPlan { Poetry(Box), Pdm(Box), Pipenv(Box), + /// The lock already routes this package through THIS patch uuid's + /// vendored wheel: no wiring — verify (or rebuild) the artifact only. 
+ InSync, } /// Which `VendorEntry` meta slot a flavor's wiring produced. @@ -232,6 +235,20 @@ enum MetaSlot { None, } +/// The uuid dir holds a wheel artifact — the cheap, flavor-agnostic +/// presence probe for the in-sync hot path (one uuid owns one wheel). +async fn uuid_dir_has_wheel(uuid_dir: &Path) -> bool { + let Ok(mut rd) = tokio::fs::read_dir(uuid_dir).await else { + return false; + }; + while let Ok(Some(e)) = rd.next_entry().await { + if e.file_name().to_string_lossy().ends_with(".whl") { + return true; + } + } + false +} + /// Build the synthesized AlreadyPatched outcome for an in-sync re-run: the /// artifact + lockfile already point at THIS patch uuid, so nothing is built /// or recorded (the first run's ledger entry holds the only copy of the @@ -323,12 +340,15 @@ pub async fn vendor_pypi( Ok(p) => p, Err((code, detail)) => return VendorOutcome::Refused { code, detail }, }; - if let Err((code, detail)) = check_target_guards(&project, &canon_name) { - return VendorOutcome::Refused { code, detail }; + match check_target_guards(&project, &canon_name, &record.uuid) { + Ok(UvTarget::InSync) => WiringPlan::InSync, + Ok(UvTarget::Fresh) => { + warnings.extend(project.warnings.iter().cloned()); + let class = classify_dependency(&project, &canon_name); + WiringPlan::Uv(Box::new(project), class) + } + Err((code, detail)) => return VendorOutcome::Refused { code, detail }, } - warnings.extend(project.warnings.iter().cloned()); - let class = classify_dependency(&project, &canon_name); - WiringPlan::Uv(Box::new(project), class) } PypiFlavor::Requirements => { if let Err((code, detail)) = @@ -349,12 +369,13 @@ pub async fn vendor_pypi( version, &record.uuid, ) { - Ok(PoetryTarget::Fresh) => {} - Ok(PoetryTarget::InSync) => return in_sync_outcome(base, record, warnings), + Ok(PoetryTarget::InSync) => WiringPlan::InSync, + Ok(PoetryTarget::Fresh) => { + warnings.extend(project.warnings.iter().cloned()); + WiringPlan::Poetry(Box::new(project)) + } Err((code, 
detail)) => return VendorOutcome::Refused { code, detail }, } - warnings.extend(project.warnings.iter().cloned()); - WiringPlan::Poetry(Box::new(project)) } PypiFlavor::Pdm => { let project = match super::pypi_pdm::load_pdm_project(project_root).await { @@ -363,12 +384,13 @@ pub async fn vendor_pypi( }; match super::pypi_pdm::check_target_guards(&project, &canon_name, version, &record.uuid) { - Ok(PdmTarget::Fresh) => {} - Ok(PdmTarget::InSync) => return in_sync_outcome(base, record, warnings), + Ok(PdmTarget::InSync) => WiringPlan::InSync, + Ok(PdmTarget::Fresh) => { + warnings.extend(project.warnings.iter().cloned()); + WiringPlan::Pdm(Box::new(project)) + } Err((code, detail)) => return VendorOutcome::Refused { code, detail }, } - warnings.extend(project.warnings.iter().cloned()); - WiringPlan::Pdm(Box::new(project)) } PypiFlavor::Pipenv => { let project = match super::pypi_pipenv::load_pipenv_project(project_root).await { @@ -376,15 +398,28 @@ pub async fn vendor_pypi( Err((code, detail)) => return VendorOutcome::Refused { code, detail }, }; match super::pypi_pipenv::check_target_guards(&project, &canon_name, &record.uuid) { - Ok(PipenvTarget::Fresh) => {} - Ok(PipenvTarget::InSync) => return in_sync_outcome(base, record, warnings), + Ok(PipenvTarget::InSync) => WiringPlan::InSync, + Ok(PipenvTarget::Fresh) => { + warnings.extend(project.warnings.iter().cloned()); + WiringPlan::Pipenv(Box::new(project)) + } Err((code, detail)) => return VendorOutcome::Refused { code, detail }, } - warnings.extend(project.warnings.iter().cloned()); - WiringPlan::Pipenv(Box::new(project)) } }; + let in_sync = matches!(plan, WiringPlan::InSync); + if in_sync { + // Wired to this uuid already. Intact artifact → the classic in-sync + // skip (no dist lookup — a not-installed re-run must stay green). + // Missing artifact → rebuild the wheel only; the wiring is correct + // and re-running it would re-record live vendored fragments as + // pre-vendor originals. 
+ if uuid_dir_has_wheel(&project_root.join(&uuid_dir_rel)).await || dry_run { + return in_sync_outcome(base, record, warnings); + } + } + let dist = match locate_installed_dist(site_packages, raw_name, version).await { Ok(d) => d, Err((code, detail)) => return VendorOutcome::Refused { code, detail }, @@ -405,6 +440,7 @@ pub async fn vendor_pypi( &dest, dry_run, force, + &mut warnings, ) .await; let (result, artifact) = match built { @@ -457,6 +493,40 @@ pub async fn vendor_pypi( )); } + if in_sync { + // Artifact rebuilt; wiring untouched, ledger entry stays with the + // first run (the only copy of the pre-vendor originals). + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored wheel for {canon_name}=={version} was missing; \ + rebuilt at {rel_wheel} (lockfile untouched)" + ), + )); + // Restore the informational marker the deleted uuid dir lost. + let mut vulns: Vec = record.vulnerabilities.keys().cloned().collect(); + vulns.sort(); + let marker = VendorMarker { + schema_version: 1, + purl: base.to_string(), + patch_uuid: record.uuid.clone(), + ecosystem: "pypi".to_string(), + vulnerabilities: vulns, + vendored_at: vendored_at.to_string(), + }; + if let Err(e) = write_marker(&project_root.join(&uuid_dir_rel), &marker).await { + warnings.push(VendorWarning::new( + "marker_write_failed", + format!("could not write the vendor marker: {e}"), + )); + } + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + // Marker: artifact-side breadcrumb in the uuid dir (informational only — // sweep/verify key off state.json + the path uuid). Written before the // wiring so lockfile edits stay the last mutation. 
@@ -494,6 +564,7 @@ pub async fn vendor_pypi( &wheel_name, &artifact.sha256_hex, class, + &record.uuid, ) .await .map(|(wiring, meta)| (wiring, MetaSlot::Uv(Some(meta)))), @@ -540,6 +611,8 @@ pub async fn vendor_pypi( ) .await .map(|(wiring, meta)| (wiring, MetaSlot::Pipenv(meta))), + // Returned right after the wheel build above. + WiringPlan::InSync => unreachable!("in-sync rebuilds never reach wiring"), }; let (wiring, meta) = match wired { Ok(pair) => pair, @@ -953,6 +1026,130 @@ mod tests { assert!(!fx.root.join(format!(".socket/vendor/pypi/{UUID}")).exists()); } + /// uv flavor, wired pair with a deleted committed wheel: the wheel is + /// rebuilt at the recorded path, pyproject + lock stay byte-identical, + /// no fresh ledger entry. An INTACT wheel stays the classic in-sync skip. + #[tokio::test] + async fn uv_wired_missing_wheel_rebuilds_artifact_only() { + let fx = e2e_fixture().await; + // Swap the requirements flavor for a uv project. + tokio::fs::remove_file(fx.root.join("requirements.txt")) + .await + .unwrap(); + touch( + &fx.root, + "pyproject.toml", + r#"[project] +name = "proj" +version = "0.1.0" +requires-python = ">=3.10" +dependencies = ["six==1.16.0"] +"#, + ) + .await; + touch( + &fx.root, + "uv.lock", + r#"version = 1 +revision = 3 +requires-python = ">=3.10" + +[[package]] +name = "proj" +version = "0.1.0" +source = { virtual = "." 
} +dependencies = [ + { name = "six" }, +] + +[package.metadata] +requires-dist = [{ name = "six", specifier = "==1.16.0" }] + +[[package]] +name = "six" +version = "1.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", size = 34041, upload-time = "2021-05-05T14:18:18.379Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254", size = 11053, upload-time = "2021-05-05T14:18:17.237Z" }, +] +"#, + ) + .await; + let sources = PatchSources::blobs_only(&fx.blobs); + let vendor_one = |dry_run: bool| { + vendor_pypi( + "pkg:pypi/six@1.16.0", + &fx.site_packages, + &fx.root, + &fx.record, + &sources, + "2026-06-09T00:00:00Z", + dry_run, + false, + ) + }; + + let VendorOutcome::Done { result, entry, .. } = vendor_one(false).await else { + panic!("first vendor must be Done"); + }; + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some()); + let pyproject1 = tokio::fs::read(fx.root.join("pyproject.toml")) + .await + .unwrap(); + let lock1 = tokio::fs::read(fx.root.join("uv.lock")).await.unwrap(); + let uuid_dir = fx.root.join(format!(".socket/vendor/pypi/{UUID}")); + let wheel = uuid_dir.join("six-1.16.0-py2.py3-none-any.whl"); + assert!(wheel.is_file()); + + // Intact wheel: in-sync skip (no rebuild, no entry). 
+ let VendorOutcome::Done { + result: r2, + entry: e2, + warnings: w2, + } = vendor_one(false).await + else { + panic!("re-run must be Done"); + }; + assert!(r2.success); + assert!(e2.is_none(), "in-sync re-run records nothing"); + assert!( + !w2.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "intact wheel must not claim a rebuild: {w2:?}" + ); + + // Deleted wheel: artifact-only rebuild. + tokio::fs::remove_dir_all(&uuid_dir).await.unwrap(); + let VendorOutcome::Done { + result: r3, + entry: e3, + warnings: w3, + } = vendor_one(false).await + else { + panic!("rebuild run must be Done"); + }; + assert!(r3.success, "{:?}", r3.error); + assert!(e3.is_none(), "artifact-only rebuild records no entry"); + assert!( + w3.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {w3:?}" + ); + assert!(wheel.is_file(), "wheel rebuilt at the recorded path"); + assert_eq!( + tokio::fs::read(fx.root.join("pyproject.toml")) + .await + .unwrap(), + pyproject1, + "pyproject untouched by the rebuild" + ); + assert_eq!( + tokio::fs::read(fx.root.join("uv.lock")).await.unwrap(), + lock1, + "uv.lock untouched by the rebuild" + ); + } + #[tokio::test] async fn uuid_traversal_is_refused_before_any_write() { let fx = e2e_fixture().await; diff --git a/crates/socket-patch-core/src/patch/vendor/pypi_uv.rs b/crates/socket-patch-core/src/patch/vendor/pypi_uv.rs index 7fba29c..2f895ab 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi_uv.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi_uv.rs @@ -259,10 +259,20 @@ pub fn classify_dependency(p: &UvProject, canon_name: &str) -> UvDepClass { /// Split out of [`load_uv_project`] because they need the target name; the /// orchestrator runs them pre-flight so a refusal happens before the wheel /// artifact is built. +/// Pre-flight wiring state for one package (mirrors `PdmTarget`). 
+#[derive(Debug, PartialEq, Eq)] +pub(super) enum UvTarget { + Fresh, + /// `[tool.uv.sources]` already routes the package through THIS patch + /// uuid's vendored wheel — the in-sync hot path. + InSync, +} + pub(super) fn check_target_guards( p: &UvProject, canon_name: &str, -) -> Result<(), (&'static str, String)> { + record_uuid: &str, +) -> Result { // The same name at multiple versions/sources (platform forks) means one // surgical [[package]] rewrite would mispin the other forks — refuse. let units = p @@ -309,6 +319,13 @@ pub(super) fn check_target_guards( .and_then(|t| t.get("path")) .and_then(Value::as_str) .unwrap_or(""); + // Ours at the SAME patch generation: in sync — the sources and + // override entries are our own first-run edits, expected here. + if super::path::parse_vendor_path(path) + .is_some_and(|parts| parts.eco == "pypi" && parts.uuid == record_uuid) + { + return Ok(UvTarget::InSync); + } let detail = if path.contains(".socket/vendor/pypi/") { format!( "[tool.uv.sources] already routes {key} to a socket-patch vendored wheel; \ @@ -345,7 +362,7 @@ pub(super) fn check_target_guards( } } } - Ok(()) + Ok(UvTarget::Fresh) } /// Wire the pair for the vendored wheel. 
Writes `pyproject.toml` FIRST, then @@ -362,8 +379,9 @@ pub async fn wire_uv( wheel_file_name: &str, wheel_sha256_hex: &str, class: UvDepClass, + record_uuid: &str, ) -> Result<(Vec, UvMeta), (&'static str, String)> { - check_target_guards(p, canon_name)?; + check_target_guards(p, canon_name, record_uuid)?; let mut wiring: Vec = Vec::new(); // ── pyproject.toml (computed in memory; committed before the lock) ──── @@ -1296,6 +1314,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1343,6 +1362,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Transitive, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1442,6 +1462,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); @@ -1459,6 +1480,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Transitive, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); @@ -1480,13 +1502,15 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); assert_eq!(err.0, "pypi_uv_source_already_exists"); assert!(err.1.contains("user-authored"), "{}", err.1); - // an existing SOCKET source refuses too, pointing at --revert + // an existing SOCKET source from a STALE patch generation refuses, + // pointing at --revert; the SAME generation is the in-sync hot path. 
let tmp = write_pair( &format!("{DIRECT_REGISTRY_PYPROJECT}\n[tool.uv.sources]\nsix = {{ path = \"{REL_WHEEL}\" }}\n"), DIRECT_REGISTRY_LOCK, @@ -1502,11 +1526,17 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "11111111-2222-4333-8444-555555555555", ) .await .unwrap_err(); assert_eq!(err.0, "pypi_uv_source_already_exists"); assert!(err.1.contains("--revert"), "{}", err.1); + assert_eq!( + check_target_guards(&p, "six", "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"), + Ok(UvTarget::InSync), + "the same patch generation is in sync, not a refusal" + ); // a user override for the package let tmp = write_pair( @@ -1524,6 +1554,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Transitive, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); @@ -1567,6 +1598,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); @@ -1593,6 +1625,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1622,6 +1655,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Transitive, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1654,6 +1688,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1682,6 +1717,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); diff --git a/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs b/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs index 69d4ffb..4b851ff 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs @@ -20,7 +20,7 @@ use sha2::Digest as _; use crate::crawlers::python_crawler::{canonicalize_pypi_name, read_python_metadata}; use crate::manifest::schema::PatchRecord; use crate::patch::apply::{ - apply_package_patch, is_safe_relative_subpath, 
normalize_file_path, ApplyResult, PatchSources, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, }; use crate::utils::fs::{atomic_write_bytes, list_dir_entries}; @@ -255,6 +255,7 @@ pub async fn build_patched_wheel( dest: &Path, dry_run: bool, force: bool, + warnings: &mut Vec, ) -> Result<(ApplyResult, Option), (&'static str, String)> { // Editable installs (`pip install -e` / uv tool dev mode) point // site-packages at the user's own working tree: the RECORD describes a @@ -371,15 +372,18 @@ pub async fn build_patched_wheel( } // Patch the stage through the shared apply pipeline (same verify/source - // strategy contract as `apply`). The installed tree is never touched. - let mut result = apply_package_patch( + // strategy contract as `apply`, with the vendor auto-force policy — + // see `force_apply_staged`). The installed tree is never touched. + let mut result = super::force_apply_staged( purl, stage.path(), - &record.files, + record, sources, - Some(&record.uuid), dry_run, force, + &dist.dist_name, + &dist.version, + warnings, ) .await; if dry_run || !result.success { @@ -903,6 +907,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -952,6 +957,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap_err(); @@ -977,6 +983,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -994,6 +1001,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1044,6 +1052,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1082,6 +1091,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap_err(); @@ -1105,6 +1115,7 @@ mod tests { &fx.dest, true, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1120,8 +1131,12 @@ mod tests { ); } + /// Vendor auto-force policy: installed content matching NEITHER hash is + /// overwritten with the verified patched content in the STAGE (the + /// installed 
tree is never touched), and the overwrite is surfaced as a + /// `vendor_content_mismatch_overwritten` warning. #[tokio::test] - async fn hash_mismatch_fails_without_touching_install_or_dest() { + async fn hash_mismatch_overwrites_in_stage_with_warning() { let fx = make_fixture("", None).await; // Corrupt the installed six.py so verify sees a HashMismatch. tokio::fs::write(fx.site_packages.join("six.py"), b"tampered") @@ -1132,6 +1147,7 @@ mod tests { .unwrap(); let record = patch_record(&[("six.py", ORIG, PATCHED)]); let sources = PatchSources::blobs_only(&fx.blobs); + let mut warnings = Vec::new(); let (result, artifact) = build_patched_wheel( "pkg:pypi/six@1.16.0", &fx.site_packages, @@ -1141,10 +1157,65 @@ mod tests { &fx.dest, false, false, + &mut warnings, + ) + .await + .unwrap(); + assert!(result.success, "{:?}", result.error); + assert!(artifact.is_some()); + assert!(fx.dest.exists(), "patched wheel must be written"); + assert_eq!( + warnings + .iter() + .filter(|w| w.code == "vendor_content_mismatch_overwritten") + .count(), + 1, + "overwrite surfaced as a warning: {warnings:?}" + ); + // Installed tree untouched — only the stage was overwritten. + assert_eq!( + tokio::fs::read(fx.site_packages.join("six.py")) + .await + .unwrap(), + b"tampered" + ); + } + + /// A patch-target file MISSING from the install still fails closed + /// without `--force` — auto-force must not inherit force's silent + /// NotFound skip (the wheel would ship without the fix). 
+ #[tokio::test] + async fn missing_patch_file_fails_without_force() { + let fx = make_fixture("", None).await; + tokio::fs::remove_file(fx.site_packages.join("six.py")) + .await + .unwrap(); + let dist = locate_installed_dist(&fx.site_packages, "six", "1.16.0") + .await + .unwrap(); + let record = patch_record(&[("six.py", ORIG, PATCHED)]); + let sources = PatchSources::blobs_only(&fx.blobs); + let (result, artifact) = build_patched_wheel( + "pkg:pypi/six@1.16.0", + &fx.site_packages, + &dist, + &record, + &sources, + &fx.dest, + false, + false, + &mut Vec::new(), ) .await .unwrap(); assert!(!result.success); + // The RECORD staging step trips first ("RECORD member ... is + // unreadable") — either way the build fails closed rather than + // packing a wheel without the fix. + assert!( + result.error.is_some(), + "missing file fails closed with an error" + ); assert!(artifact.is_none()); assert!(!fx.dest.exists()); } diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs new file mode 100644 index 0000000..618b4a0 --- /dev/null +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -0,0 +1,1529 @@ +//! Pristine-artifact fetching for lockfile-resolved packages with no +//! installed copy. +//! +//! `vendor` needs an installed package dir to stage from; on a fresh clone +//! there is none. This module downloads the pristine artifact the lockfile +//! resolves (the lock-recorded URL when present, the conventional registry +//! URL otherwise), verifies it against the integrity the lock records +//! **FAIL-CLOSED and before anything is written to the staging dir**, and +//! extracts it into a private tempdir the vendor pipeline then treats as +//! the installed dir. The project tree — node_modules included — is never +//! touched. +//! +//! Trust model: the URL comes from the user's own committed lockfile (or a +//! 
conventional construction from it); content trust comes from the +//! lock-recorded hash, not the transport — which is also why an entry with +//! no verifier ([`LockIntegrity::None`]) is refused outright +//! ([`FetchError::Unverifiable`]) without any network I/O. + +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use base64::Engine as _; +use sha1::Sha1; +use sha2::{Digest, Sha256, Sha384, Sha512}; + +use crate::constants::USER_AGENT; +use crate::patch::apply::is_safe_relative_subpath; + +use super::lock_inventory::{LockIntegrity, LockfileEntry}; + +/// The default npm registry; override with `SOCKET_NPM_REGISTRY` (the +/// enterprise-mirror / test escape hatch — `.npmrc` parsing is out of +/// scope, but lock-recorded `resolved` URLs already carry custom hosts). +pub const DEFAULT_NPM_REGISTRY: &str = "https://registry.npmjs.org"; + +/// Whole-package caps — wider than `patch/package.rs`'s patch-archive caps +/// because these are full upstream packages, but still bounded so a +/// poisoned lockfile cannot turn the fetch into a disk/memory bomb. +const MAX_DOWNLOAD_BYTES: u64 = 128 * 1024 * 1024; +const MAX_TOTAL_DECOMPRESSED_BYTES: u64 = 512 * 1024 * 1024; +const MAX_ENTRY_BYTES: u64 = 128 * 1024 * 1024; +const MAX_ENTRIES: usize = 60_000; + +/// A fetched, verified, extracted package. The tempdir lives exactly as +/// long as this value — callers must hold it until the vendor pipeline has +/// finished staging from [`FetchedPackage::dir`]. +#[derive(Debug)] +pub struct FetchedPackage { + dir: PathBuf, + /// Where the bytes came from (surfaced in the fetch warning event). + pub url: String, + _tmp: tempfile::TempDir, +} + +impl FetchedPackage { + /// The extracted package root (`package.json` at the top for npm). 
+ pub fn dir(&self) -> &Path { + &self.dir + } +} + +#[derive(Debug)] +pub enum FetchError { + /// The entry cannot be verified against the lockfile (no integrity + /// recorded, or no fetcher for its ecosystem) — decided BEFORE any + /// network I/O; the caller keeps its `package_not_installed` outcome. + Unverifiable(String), + /// The fetch was attempted and failed (HTTP error, size cap, integrity + /// mismatch, extraction failure). User-facing message. + Failed(String), +} + +/// One shared client for all fetches in a run. +/// The registry HTTP client type, nameable by callers that don't depend on +/// reqwest directly (the CLI's pristine-source ladder). +pub type RegistryClient = reqwest::Client; + +pub fn build_registry_client() -> RegistryClient { + reqwest::Client::builder() + .user_agent(USER_AGENT) + .timeout(Duration::from_secs(60)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()) +} + +/// The npm registry base after the env override. +pub fn npm_registry_base() -> String { + std::env::var("SOCKET_NPM_REGISTRY") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_NPM_REGISTRY.to_string()) +} + +/// Conventional npm tarball URL: the scope stays in the package path, the +/// tarball leaf uses the bare name — +/// `{base}/@scope/name/-/name-1.0.0.tgz` / `{base}/name/-/name-1.0.0.tgz`. +pub fn npm_tarball_url(base: &str, name: &str, version: &str) -> String { + let leaf = name.rsplit('/').next().unwrap_or(name); + format!("{base}/{name}/-/{leaf}-{version}.tgz") +} + +/// Fetch + verify + extract one lockfile entry. Ecosystems without a +/// fetcher yet return [`FetchError::Unverifiable`] (callers keep their +/// not-installed outcome). 
+pub async fn fetch_and_stage( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + if entry.integrity == LockIntegrity::None { + return Err(FetchError::Unverifiable(format!( + "the lockfile records no integrity hash for {}@{}; refusing to fetch \ + unverifiable content", + entry.name, entry.version + ))); + } + match entry.ecosystem { + "npm" => fetch_npm(entry, client).await, + #[cfg(feature = "cargo")] + "cargo" => fetch_cargo(entry, client).await, + #[cfg(feature = "golang")] + "golang" => fetch_golang(entry, client).await, + #[cfg(feature = "composer")] + "composer" => fetch_composer(entry, client).await, + "gem" => fetch_gem(entry, client).await, + "pypi" => fetch_pypi(entry, client).await, + other => Err(FetchError::Unverifiable(format!( + "no registry fetcher for ecosystem `{other}`" + ))), + } +} + +/// Traversal-guarded zip extraction. `strip_first` mirrors the tar +/// behavior (composer dist zips carry a variable top dir; wheels carry +/// content at the root). 
+fn extract_zip(bytes: &[u8], dest: &Path, strip_first: bool) -> Result<(), String> { + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable zip: {e}"))?; + if archive.len() > MAX_ENTRIES { + return Err(format!("zip exceeds {MAX_ENTRIES} entries")); + } + let mut total: u64 = 0; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable zip entry: {e}"))?; + if file.is_dir() { + continue; + } + let raw = PathBuf::from(file.name()); + let rel = if strip_first { + match strip_first_component(&raw) { + Some(rel) => rel, + None => continue, + } + } else { + raw.clone() + }; + let rel_str = rel.to_string_lossy().into_owned(); + if !is_safe_relative_subpath(&rel_str) { + return Err(format!( + "zip entry `{}` escapes the extraction dir — refusing the artifact", + raw.display() + )); + } + if file.size() > MAX_ENTRY_BYTES { + return Err(format!( + "zip entry `{rel_str}` is {} bytes (cap {MAX_ENTRY_BYTES})", + file.size() + )); + } + total += file.size(); + if total > MAX_TOTAL_DECOMPRESSED_BYTES { + return Err(format!( + "zip decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap" + )); + } + let target = dest.join(&rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut file, &mut out) + .map_err(|e| format!("cannot extract `{rel_str}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let exec = file.unix_mode().is_some_and(|m| m & 0o111 != 0); + let perms = if exec { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + Ok(()) +} + +/// Composer dist zips (packagist/GitHub zipballs): sha1-verified, variable +/// top dir stripped. 
The extracted dir plays the installed package dir. +#[cfg(feature = "composer")] +async fn fetch_composer( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "composer.lock records no dist URL for {}@{}", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_zip(&bytes, &dir, /*strip_first=*/ true).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("composer.json")) + .await + .is_err() + { + return Err(FetchError::Failed(format!( + "fetched dist for {}@{} carries no composer.json", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// `.gem` files are plain tar containers holding `data.tar.gz` (the +/// package content, no prefix dir) + metadata. The whole `.gem` is +/// sha256-verified against the Gemfile.lock CHECKSUMS entry first. +async fn fetch_gem( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "no download URL for {}@{}", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + + // Locate data.tar.gz inside the (uncompressed) outer tar. + let mut archive = tar::Archive::new(bytes.as_slice()); + let mut data: Option> = None; + for e in archive + .entries() + .map_err(|e| FetchError::Failed(format!("unreadable .gem: {e}")))? 
+ { + use std::io::Read as _; + let mut e = e.map_err(|err| FetchError::Failed(format!("unreadable .gem entry: {err}")))?; + let is_data = e + .path() + .ok() + .is_some_and(|p| p.as_os_str() == "data.tar.gz"); + if !is_data { + continue; + } + if e.header().size().unwrap_or(u64::MAX) > MAX_DOWNLOAD_BYTES { + return Err(FetchError::Failed( + "data.tar.gz exceeds the size cap".into(), + )); + } + let mut buf = Vec::new(); + e.read_to_end(&mut buf) + .map_err(|err| FetchError::Failed(format!("cannot read data.tar.gz: {err}")))?; + data = Some(buf); + break; + } + let Some(data) = data else { + return Err(FetchError::Failed(format!( + "fetched .gem for {}@{} carries no data.tar.gz", + entry.name, entry.version + ))); + }; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("gem"); + extract_tgz_no_strip(&data, &dir).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Pure-python wheels recorded by uv.lock (URL + sha256): the unzipped +/// wheel IS a site-packages layout (package dirs + `.dist-info/RECORD` at +/// the root), which is exactly the shape the pypi vendor backend stages +/// from. 
+async fn fetch_pypi( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "the lockfile records no platform-independent wheel URL for {}@{} (only uv.lock carries fetchable wheel resolutions today)", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("site-packages"); + extract_zip(&bytes, &dir, /*strip_first=*/ false).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// crates.io static download host; override with `SOCKET_CRATES_REGISTRY`. +#[cfg(feature = "cargo")] +pub const DEFAULT_CRATES_REGISTRY: &str = "https://static.crates.io/crates"; + +#[cfg(feature = "cargo")] +fn crates_registry_base() -> String { + std::env::var("SOCKET_CRATES_REGISTRY") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_CRATES_REGISTRY.to_string()) +} + +/// `.crate` files are tar.gz with a `{name}-{version}/` top dir — the same +/// extraction path as npm tarballs. The Cargo.lock `checksum` is the sha256 +/// of the `.crate` bytes. 
+#[cfg(feature = "cargo")] +async fn fetch_cargo( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let url = entry.resolved.clone().unwrap_or_else(|| { + format!( + "{}/{}/{}-{}.crate", + crates_registry_base(), + entry.name, + entry.name, + entry.version + ) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("crate"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("Cargo.toml")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched .crate for {}@{} carries no Cargo.toml — not a crate", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Default Go module proxy; `SOCKET_GOPROXY` wins, else the standard +/// `GOPROXY` env (first element that isn't `direct`/`off`). +#[cfg(feature = "golang")] +pub const DEFAULT_GOPROXY: &str = "https://proxy.golang.org"; + +#[cfg(feature = "golang")] +fn goproxy_base() -> String { + if let Ok(v) = std::env::var("SOCKET_GOPROXY") { + let v = v.trim_end_matches('/').to_string(); + if !v.is_empty() { + return v; + } + } + if let Ok(v) = std::env::var("GOPROXY") { + for part in v.split(',') { + let part = part.trim().trim_end_matches('/'); + if !part.is_empty() && part != "direct" && part != "off" { + return part.to_string(); + } + } + } + DEFAULT_GOPROXY.to_string() +} + +/// Go's module-path case encoding for proxy URLs: an uppercase letter `X` +/// becomes `!x` (applies to the module path and the version). 
+#[cfg(feature = "golang")] +fn go_escape(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + if c.is_ascii_uppercase() { + out.push('!'); + out.push(c.to_ascii_lowercase()); + } else { + out.push(c); + } + } + out +} + +/// go.sum's `h1:` dirhash over a module zip: sha256 of the sorted +/// `"{sha256hex(content)} {entry name}\n"` lines, base64-encoded +/// (golang.org/x/mod/sumdb/dirhash Hash1/HashZip). Computed in memory +/// BEFORE extraction. +#[cfg(feature = "golang")] +fn go_h1_of_zip(bytes: &[u8]) -> Result { + use std::io::Read as _; + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable module zip: {e}"))?; + if archive.len() > MAX_ENTRIES { + return Err(format!("module zip exceeds {MAX_ENTRIES} entries")); + } + let mut files: Vec<(String, String)> = Vec::new(); + let mut total: u64 = 0; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable module zip entry: {e}"))?; + if file.is_dir() { + continue; // go module zips carry files only + } + let name = file.name().to_string(); + if name.contains('\n') { + return Err("module zip entry name contains a newline".to_string()); + } + if file.size() > MAX_ENTRY_BYTES { + return Err(format!( + "module zip entry `{name}` is {} bytes (cap {MAX_ENTRY_BYTES})", + file.size() + )); + } + total += file.size(); + if total > MAX_TOTAL_DECOMPRESSED_BYTES { + return Err(format!( + "module zip decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap" + )); + } + let mut hasher = Sha256::new(); + let mut buf = [0u8; 64 * 1024]; + loop { + let n = file + .read(&mut buf) + .map_err(|e| format!("cannot read module zip entry `{name}`: {e}"))?; + if n == 0 { + break; + } + hasher.update(&buf[..n]); + } + files.push((name, hex::encode(hasher.finalize()))); + } + files.sort_by(|a, b| a.0.cmp(&b.0)); + let mut h = Sha256::new(); + for (name, content_hex) in &files { + 
h.update(format!("{content_hex} {name}\n").as_bytes()); + } + Ok(format!( + "h1:{}", + base64::engine::general_purpose::STANDARD.encode(h.finalize()) + )) +} + +/// Traversal-guarded zip extraction with an EXPLICIT required prefix +/// (`@/` — go module paths contain slashes, so a +/// first-component strip would be wrong). Same guard family as +/// [`extract_tgz`]; an entry outside the prefix fails the whole artifact. +#[cfg(feature = "golang")] +fn extract_zip_with_prefix(bytes: &[u8], dest: &Path, prefix: &str) -> Result<(), String> { + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable module zip: {e}"))?; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable module zip entry: {e}"))?; + if file.is_dir() { + continue; + } + let name = file.name().to_string(); + let Some(rel) = name.strip_prefix(prefix) else { + return Err(format!( + "module zip entry `{name}` lies outside `{prefix}` — refusing the artifact" + )); + }; + if !is_safe_relative_subpath(rel) { + return Err(format!( + "module zip entry `{name}` escapes the extraction dir — refusing the artifact" + )); + } + let target = dest.join(rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut file, &mut out).map_err(|e| format!("cannot extract `{rel}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let exec = file.unix_mode().is_some_and(|m| m & 0o111 != 0); + let perms = if exec { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + Ok(()) +} + +#[cfg(feature = "golang")] +async fn fetch_golang( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let 
LockIntegrity::GoH1(expected) = &entry.integrity else { + return Err(FetchError::Unverifiable( + "go module entries verify via the go.sum h1 dirhash only".to_string(), + )); + }; + let url = entry.resolved.clone().unwrap_or_else(|| { + format!( + "{}/{}/@v/{}.zip", + goproxy_base(), + go_escape(&entry.name), + go_escape(&entry.version) + ) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + let actual = go_h1_of_zip(&bytes).map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "go.sum dirhash mismatch: lockfile records {expected}, the fetched module zip \ + hashes to {actual}" + ))); + } + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("module"); + let prefix = format!("{}@{}/", entry.name, entry.version); + extract_zip_with_prefix(&bytes, &dir, &prefix).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +async fn fetch_npm( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + fetch_npm_inner(entry, client, true).await +} + +async fn fetch_npm_inner( + entry: &LockfileEntry, + client: &reqwest::Client, + verify: bool, +) -> Result { + let url = entry + .resolved + .clone() + .unwrap_or_else(|| npm_tarball_url(&npm_registry_base(), &entry.name, &entry.version)); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + if !verify { + // fetch_npm_unverified: the caller owns end-to-end verification. + } else { + match &entry.integrity { + // yarn berry locks never hash the tarball itself — the checksum is + // sha512 of the deterministic cache zip. Rebuild it from the fetched + // bytes (the same spike-pinned recipe the berry wiring uses) and + // compare. Only cacheKey 10c0 (yarn 4 default) is reproducible. 
+ LockIntegrity::BerryChecksum(expected) => { + if !expected.starts_with("10c0/") { + return Err(FetchError::Unverifiable(format!( + "yarn berry checksum `{expected}` uses a cacheKey other than 10c0; \ + the cache-zip recipe is not reproducible for it" + ))); + } + let actual = super::berry_zip::berry_cache_checksum_10c0(&bytes, &entry.name) + .map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "yarn berry cache checksum mismatch: lockfile records {expected}, \ + the fetched tarball rebuilds to {actual}" + ))); + } + } + other => verify_integrity(&bytes, other)?, + } + } + + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("package.json")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched tarball for {}@{} carries no package.json — not an npm package", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Stage a package from an on-disk vendored tarball (the fresh-clone +/// re-vendor path: the project has our committed artifact but no installed +/// copy). The bytes are verified against the LEDGER-recorded sha256 before +/// extraction — same fail-closed posture as the registry path; an entry +/// with no recorded hash is refused. 
+pub async fn stage_local_artifact( + tgz_path: &Path, + expected_sha256_hex: &str, +) -> Result { + if expected_sha256_hex.is_empty() { + return Err(FetchError::Unverifiable( + "the vendor ledger records no sha256 for the artifact".to_string(), + )); + } + let bytes = tokio::fs::read(tgz_path) + .await + .map_err(|e| FetchError::Failed(format!("cannot read {}: {e}", tgz_path.display())))?; + if bytes.len() as u64 > MAX_DOWNLOAD_BYTES { + return Err(FetchError::Failed(format!( + "{}: artifact exceeds the {MAX_DOWNLOAD_BYTES}-byte cap", + tgz_path.display() + ))); + } + let actual = hex::encode(Sha256::digest(&bytes)); + if !actual.eq_ignore_ascii_case(expected_sha256_hex) { + return Err(FetchError::Failed(format!( + "{}: sha256 mismatch against the vendor ledger (recorded {expected_sha256_hex}, \ + on-disk bytes hash to {actual})", + tgz_path.display() + ))); + } + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create staging tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url: format!("file:{}", tgz_path.display()), + _tmp: tmp, + }) +} + +/// Capped download. http(s) only; the cap is enforced on the declared +/// Content-Length AND the actual stream (a lying server cannot blow past +/// it). 
+async fn download(client: &reqwest::Client, url: &str) -> Result, String> { + if !(url.starts_with("https://") || url.starts_with("http://")) { + return Err(format!("refusing non-http(s) artifact URL `{url}`")); + } + let mut resp = client + .get(url) + .send() + .await + .map_err(|e| format!("GET {url}: {e}"))?; + let status = resp.status(); + if !status.is_success() { + return Err(format!("GET {url}: HTTP {status}")); + } + if let Some(len) = resp.content_length() { + if len > MAX_DOWNLOAD_BYTES { + return Err(format!( + "{url}: artifact is {len} bytes (cap {MAX_DOWNLOAD_BYTES})" + )); + } + } + let mut bytes: Vec = Vec::new(); + while let Some(chunk) = resp + .chunk() + .await + .map_err(|e| format!("reading {url}: {e}"))? + { + if bytes.len() as u64 + chunk.len() as u64 > MAX_DOWNLOAD_BYTES { + return Err(format!( + "{url}: artifact exceeds the {MAX_DOWNLOAD_BYTES}-byte cap" + )); + } + bytes.extend_from_slice(&chunk); + } + Ok(bytes) +} + +/// Verify downloaded bytes against the lock-recorded verifier. Runs BEFORE +/// any disk write. Berry cache-zip checksums and go.sum dirhashes have +/// dedicated verifiers in their ecosystems' fetchers. +/// Fetch + stage an npm package from its conventional registry URL WITHOUT +/// content verification. The download/extract caps still apply. +/// +/// SECURITY: callers MUST end-to-end verify whatever they derive from the +/// staged copy against an independent trust anchor before committing it — +/// repair's ledger reconstruction verifies the deterministically REBUILT +/// vendored tarball against the integrity the rewired lockfile records +/// (`artifact_matches_integrity`); a tampered pristine source then changes +/// the rebuilt bytes and fails closed. 
+pub async fn fetch_npm_unverified( + name: &str, + version: &str, + client: &reqwest::Client, +) -> Result { + let entry = LockfileEntry { + ecosystem: "npm", + name: name.to_string(), + version: version.to_string(), + purl: format!("pkg:npm/{name}@{version}"), + resolved: None, + integrity: LockIntegrity::None, + }; + fetch_npm_inner(&entry, client, false).await +} + +/// Whole-artifact verification against a lock-recorded integrity (the same +/// verifiers the fetch path uses, including the berry cache-zip rebuild). +/// `name` feeds the berry cache-zip recipe; ignored otherwise. +pub fn artifact_matches_integrity( + bytes: &[u8], + name: &str, + integrity: &LockIntegrity, +) -> Result<(), String> { + match integrity { + LockIntegrity::BerryChecksum(expected) => { + if !expected.starts_with("10c0/") { + return Err(format!( + "yarn berry checksum `{expected}` uses a cacheKey other than 10c0" + )); + } + let actual = super::berry_zip::berry_cache_checksum_10c0(bytes, name)?; + if &actual == expected { + Ok(()) + } else { + Err(format!( + "yarn berry cache checksum mismatch: lockfile records {expected}, the \ + artifact rebuilds to {actual}" + )) + } + } + other => verify_integrity(bytes, other).map_err(|e| match e { + FetchError::Failed(d) | FetchError::Unverifiable(d) => d, + }), + } +} + +fn verify_integrity(bytes: &[u8], integrity: &LockIntegrity) -> Result<(), FetchError> { + match integrity { + LockIntegrity::Sri(sri) => verify_sri(bytes, sri).map_err(FetchError::Failed), + LockIntegrity::Sha1Hex(expect) => { + let actual = hex::encode(Sha1::digest(bytes)); + if &actual == expect { + Ok(()) + } else { + Err(FetchError::Failed(format!( + "sha1 mismatch: lockfile records {expect}, downloaded bytes hash to {actual}" + ))) + } + } + LockIntegrity::Sha256Hex(expect) => { + let actual = hex::encode(Sha256::digest(bytes)); + if actual.eq_ignore_ascii_case(expect) { + Ok(()) + } else { + Err(FetchError::Failed(format!( + "sha256 mismatch: lockfile records {expect}, 
downloaded bytes hash to {actual}" + ))) + } + } + LockIntegrity::BerryChecksum(_) | LockIntegrity::GoH1(_) => Err(FetchError::Unverifiable( + "verifier handled by a dedicated ecosystem fetcher".to_string(), + )), + LockIntegrity::None => Err(FetchError::Unverifiable( + "no integrity recorded".to_string(), + )), + } +} + +/// SRI verification: pick the strongest hash of a (possibly multi-hash, +/// whitespace-separated) SRI string and compare base64 digests. +fn verify_sri(bytes: &[u8], sri: &str) -> Result<(), String> { + let mut best: Option<(u8, &str, &str)> = None; + for token in sri.split_whitespace() { + let Some((algo, b64)) = token.split_once('-') else { + continue; + }; + let rank = match algo { + "sha512" => 3, + "sha384" => 2, + "sha256" => 1, + _ => continue, + }; + if best.map(|(r, _, _)| rank > r).unwrap_or(true) { + best = Some((rank, algo, b64)); + } + } + let Some((_, algo, expect)) = best else { + return Err(format!("no usable hash in SRI `{sri}`")); + }; + let b64 = base64::engine::general_purpose::STANDARD; + let actual = match algo { + "sha512" => b64.encode(Sha512::digest(bytes)), + "sha384" => b64.encode(Sha384::digest(bytes)), + _ => b64.encode(Sha256::digest(bytes)), + }; + if actual == expect { + Ok(()) + } else { + Err(format!( + "{algo} integrity mismatch: lockfile records {expect}, downloaded bytes hash to \ + {actual}" + )) + } +} + +/// Strip the FIRST path component (npm's tarball semantics — usually +/// `package/`, but registry tarballs may use any prefix dir). 
fn strip_first_component(path: &Path) -> Option<PathBuf> {
    let mut components = path.components();
    // `None` for an empty path...
    components.next()?;
    let rest = components.as_path();
    // ...and `None` when nothing remains after the first component.
    (!rest.as_os_str().is_empty()).then(|| rest.to_path_buf())
}

/// Traversal-guarded, mode-preserving tgz extraction (the same guard
/// family as `patch/package.rs::read_archive_to_map`, plus exec-bit
/// preservation: the deterministic re-pack reads modes from disk, so a
/// bytes-only extraction would silently strip bin scripts' exec bits).
/// Fails CLOSED on any traversal-shaped entry — a malicious tarball must
/// not half-extract.
fn extract_tgz(bytes: &[u8], dest: &Path) -> Result<(), String> {
    extract_tar_gz(bytes, dest, /*strip_first=*/ true)
}

/// Like [`extract_tgz`] but keeps entry paths verbatim (gem `data.tar.gz`
/// archives carry package content at the root, no prefix dir).
#[allow(dead_code)] // used by the gem fetcher (feature-independent helper)
fn extract_tgz_no_strip(bytes: &[u8], dest: &Path) -> Result<(), String> {
    extract_tar_gz(bytes, dest, /*strip_first=*/ false)
}

/// Shared gzip+tar extraction behind [`extract_tgz`] and
/// [`extract_tgz_no_strip`].
///
/// Only regular-file entries are written. The decompressed stream is capped
/// at `MAX_TOTAL_DECOMPRESSED_BYTES`, the entry count at `MAX_ENTRIES`, and
/// each entry's declared size at `MAX_ENTRY_BYTES`. With `strip_first`, the
/// first path component is removed and entries with nothing left are skipped.
fn extract_tar_gz(bytes: &[u8], dest: &Path, strip_first: bool) -> Result<(), String> {
    use std::io::Read as _;
    let gz = flate2::read::GzDecoder::new(bytes).take(MAX_TOTAL_DECOMPRESSED_BYTES);
    let mut archive = tar::Archive::new(gz);
    let mut count = 0usize;
    for entry in archive
        .entries()
        .map_err(|e| format!("unreadable tarball: {e}"))?
    {
        let mut entry = entry.map_err(|e| format!("unreadable tarball entry: {e}"))?;
        count += 1;
        if count > MAX_ENTRIES {
            return Err(format!("tarball exceeds {MAX_ENTRIES} entries"));
        }
        // Regular files only: symlinks/hardlinks/devices never extract
        // (a symlink could redirect later entries out of the stage).
        if !entry.header().entry_type().is_file() {
            continue;
        }
        let raw = entry
            .path()
            .map_err(|e| format!("tarball entry has an undecodable path: {e}"))?
            .into_owned();
        let rel = if strip_first {
            match strip_first_component(&raw) {
                Some(rel) => rel,
                None => continue, // a bare prefix-level file — not package content
            }
        } else {
            raw.clone()
        };
        let rel_str = rel.to_string_lossy();
        if !is_safe_relative_subpath(&rel_str) {
            return Err(format!(
                "tarball entry `{}` escapes the extraction dir — refusing the artifact",
                raw.display()
            ));
        }
        // An unreadable size field is treated as "too big" (fail closed).
        let size = entry.header().size().unwrap_or(u64::MAX);
        if size > MAX_ENTRY_BYTES {
            return Err(format!(
                "tarball entry `{rel_str}` is {size} bytes (cap {MAX_ENTRY_BYTES})"
            ));
        }
        let target = dest.join(&rel);
        if let Some(parent) = target.parent() {
            std::fs::create_dir_all(parent)
                .map_err(|e| format!("cannot create {}: {e}", parent.display()))?;
        }
        let mut out = std::fs::File::create(&target)
            .map_err(|e| format!("cannot create {}: {e}", target.display()))?;
        std::io::copy(&mut entry, &mut out)
            .map_err(|e| format!("cannot extract `{rel_str}`: {e}"))?;
        #[cfg(unix)]
        {
            use std::os::unix::fs::PermissionsExt;
            // Normalize modes: keep only "executable or not".
            let mode = entry.header().mode().unwrap_or(0o644);
            let perms = if mode & 0o111 != 0 { 0o755 } else { 0o644 };
            // Best-effort: a failed chmod does not fail the extraction.
            let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms));
        }
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use wiremock::matchers::{method, path as url_path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    /// Build a gzipped tarball with the given `(path, bytes, exec)` entries.
+ fn make_tgz(entries: &[(&str, &[u8], bool)]) -> Vec { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes, exec) in entries { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(if *exec { 0o755 } else { 0o644 }); + header.set_cksum(); + builder.append_data(&mut header, path, *bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() + } + + fn sri_of(bytes: &[u8]) -> String { + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) + } + + fn npm_entry(resolved: Option, integrity: LockIntegrity) -> LockfileEntry { + LockfileEntry { + ecosystem: "npm", + name: "left-pad".into(), + version: "1.3.0".into(), + purl: "pkg:npm/left-pad@1.3.0".into(), + resolved, + integrity, + } + } + + #[test] + fn tarball_url_forms() { + assert_eq!( + npm_tarball_url(DEFAULT_NPM_REGISTRY, "left-pad", "1.3.0"), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz" + ); + assert_eq!( + npm_tarball_url(DEFAULT_NPM_REGISTRY, "@scope/pkg", "2.0.0"), + "https://registry.npmjs.org/@scope/pkg/-/pkg-2.0.0.tgz", + "the scope stays in the path; the leaf uses the bare name" + ); + } + + #[test] + fn sri_picks_strongest_hash_and_compares() { + let bytes = b"hello"; + let good = sri_of(bytes); + assert!(verify_sri(bytes, &good).is_ok()); + // Multi-hash: a wrong sha256 alongside the right sha512 still passes + // (strongest wins), and vice versa fails. 
+ let multi = format!("sha256-WRONG= {good}"); + assert!(verify_sri(bytes, &multi).is_ok()); + let bad = sri_of(b"other"); + assert!(verify_sri(bytes, &bad).is_err()); + assert!( + verify_sri(bytes, "md5-abc=").is_err(), + "unknown algos refuse" + ); + } + + #[tokio::test] + async fn fetch_verifies_sri_and_extracts_with_modes() { + let tgz = make_tgz(&[ + ("package/package.json", br#"{"name":"left-pad"}"#, false), + ("package/bin/cli.js", b"#!/usr/bin/env node\n", true), + ("package/index.js", b"module.exports = 1;\n", false), + ]); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz.clone())) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(&tgz)), + ); + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("package.json").is_file()); + assert_eq!( + std::fs::read(fetched.dir().join("index.js")).unwrap(), + b"module.exports = 1;\n" + ); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mode = std::fs::metadata(fetched.dir().join("bin/cli.js")) + .unwrap() + .permissions() + .mode(); + assert_eq!(mode & 0o111, 0o111, "exec bit preserved"); + } + // The tempdir dies with the holder. 
+ let dir = fetched.dir().to_path_buf(); + drop(fetched); + assert!(!dir.exists()); + } + + #[tokio::test] + async fn integrity_mismatch_fails_before_extraction() { + let tgz = make_tgz(&[("package/package.json", b"{}", false)]); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(b"the lock expects different bytes")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => { + assert!(msg.contains("mismatch"), "{msg}") + } + other => panic!("expected integrity failure, got {other:?}"), + } + } + + #[tokio::test] + async fn unverifiable_entry_refuses_without_network() { + // A URL that would hard-fail if contacted — Unverifiable proves the + // decision happened before any I/O. + let entry = npm_entry( + Some("http://127.0.0.1:1/nope.tgz".into()), + LockIntegrity::None, + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => { + assert!(msg.contains("no integrity"), "{msg}") + } + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[tokio::test] + async fn http_error_and_scheme_guard_fail_closed() { + let mock = MockServer::start().await; + // No mounted route → 404. 
+ let entry = npm_entry( + Some(format!("{}/missing.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(b"x")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("404"), "{msg}"), + other => panic!("expected HTTP failure, got {other:?}"), + } + + let entry = npm_entry( + Some("ftp://example.com/x.tgz".into()), + LockIntegrity::Sri(sri_of(b"x")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("non-http"), "{msg}"), + other => panic!("expected scheme refusal, got {other:?}"), + } + } + + #[test] + fn extraction_strips_first_component_whatever_its_name() { + let tgz = make_tgz(&[("weird-prefix/package.json", b"{}", false)]); + let tmp = tempfile::tempdir().unwrap(); + extract_tgz(&tgz, tmp.path()).unwrap(); + assert!(tmp.path().join("package.json").is_file()); + } + + #[test] + fn traversal_entries_fail_closed() { + // The tar crate refuses to WRITE `..` paths, so craft the header + // name bytes directly — exactly what a hostile tarball would carry. 
+ for evil in ["package/../../escape.js", "package/x/../../../up.js"] { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + { + let name = &mut header.as_gnu_mut().unwrap().name; + name[..evil.len()].copy_from_slice(evil.as_bytes()); + } + header.set_size(4); + header.set_mode(0o644); + header.set_cksum(); + builder.append(&header, &b"evil"[..]).unwrap(); + let tgz = builder.into_inner().unwrap().finish().unwrap(); + + let tmp = tempfile::tempdir().unwrap(); + let err = extract_tgz(&tgz, tmp.path()).unwrap_err(); + assert!(err.contains("escapes"), "{evil}: {err}"); + assert!( + std::fs::read_dir(tmp.path()).unwrap().next().is_none(), + "nothing may extract from a traversal-bearing tarball" + ); + } + } + + #[tokio::test] + async fn berry_checksum_verifies_via_cache_zip_rebuild() { + let tgz = make_tgz(&[ + ("package/package.json", br#"{"name":"left-pad"}"#, false), + ("package/index.js", b"module.exports = 1;\n", false), + ]); + let expected = + super::super::berry_zip::berry_cache_checksum_10c0(&tgz, "left-pad").unwrap(); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(expected), + ); + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("package.json").is_file()); + + // Tampered checksum → Failed; foreign cacheKey → Unverifiable. 
+ let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(format!("10c0/{}", "0".repeat(128))), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(format!("9/{}", "0".repeat(128))), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => assert!(msg.contains("cacheKey"), "{msg}"), + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[tokio::test] + async fn stage_local_artifact_verifies_ledger_sha256() { + let tgz = make_tgz(&[("package/package.json", b"{}", false)]); + let tmp = tempfile::tempdir().unwrap(); + let tgz_path = tmp.path().join("left-pad-1.3.0.tgz"); + std::fs::write(&tgz_path, &tgz).unwrap(); + let sha = hex::encode(Sha256::digest(&tgz)); + + let staged = stage_local_artifact(&tgz_path, &sha).await.unwrap(); + assert!(staged.dir().join("package.json").is_file()); + + match stage_local_artifact(&tgz_path, &"0".repeat(64)).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected ledger mismatch, got {other:?}"), + } + match stage_local_artifact(&tgz_path, "").await { + Err(FetchError::Unverifiable(_)) => {} + other => panic!("expected Unverifiable for empty hash, got {other:?}"), + } + } + + #[cfg(feature = "cargo")] + #[tokio::test] + async fn cargo_crate_fetch_verifies_sha256_and_extracts() { + // .crate = tar.gz with a {name}-{version}/ top dir. 
+ let crate_bytes = make_tgz(&[ + ( + "left-pad-1.3.0/Cargo.toml", + b"[package]\nname = \"left-pad\"\n", + false, + ), + ("left-pad-1.3.0/src/lib.rs", b"pub fn pad() {}\n", false), + ]); + let sha = hex::encode(Sha256::digest(&crate_bytes)); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/left-pad-1.3.0.crate")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(crate_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "cargo", + name: "left-pad".into(), + version: "1.3.0".into(), + purl: "pkg:cargo/left-pad@1.3.0".into(), + resolved: Some(format!("{}/left-pad/left-pad-1.3.0.crate", mock.uri())), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("Cargo.toml").is_file()); + assert!(fetched.dir().join("src/lib.rs").is_file()); + + // Tampered checksum fails closed. + let entry = LockfileEntry { + integrity: LockIntegrity::Sha256Hex("0".repeat(64)), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + /// Build a go module zip in memory (files only, `module@version/` + /// prefix — the go zip layout). 
+ #[cfg(feature = "golang")] + fn make_module_zip(prefix: &str, files: &[(&str, &[u8])]) -> Vec { + use std::io::Write as _; + let mut writer = zip::ZipWriter::new(std::io::Cursor::new(Vec::new())); + for (name, bytes) in files { + writer + .start_file( + format!("{prefix}{name}"), + zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated), + ) + .unwrap(); + writer.write_all(bytes).unwrap(); + } + writer.finish().unwrap().into_inner() + } + + /// Independent spec-mirror of dirhash Hash1/HashZip, structured + /// differently from the production fn to catch encoding slips. + #[cfg(feature = "golang")] + fn spec_h1(files: &[(&str, &[u8])], prefix: &str) -> String { + // dirhash.Hash1 sorts the FILE NAMES, then emits one line per file. + let mut named: Vec<(String, &[u8])> = files + .iter() + .map(|(name, bytes)| (format!("{prefix}{name}"), *bytes)) + .collect(); + named.sort_by(|a, b| a.0.cmp(&b.0)); + let lines: Vec = named + .iter() + .map(|(name, bytes)| format!("{} {name}\n", hex::encode(Sha256::digest(bytes)))) + .collect(); + let digest = Sha256::digest(lines.concat().as_bytes()); + format!( + "h1:{}", + base64::engine::general_purpose::STANDARD.encode(digest) + ) + } + + #[cfg(feature = "golang")] + #[tokio::test] + async fn golang_module_fetch_verifies_h1_dirhash_and_extracts() { + // Out-of-order files prove the sort; nested module path proves the + // explicit-prefix strip (a first-component strip would be wrong). 
+ let prefix = "github.com/x/y@v1.0.0/"; + let files: [(&str, &[u8]); 3] = [ + ("go.mod", b"module github.com/x/y\n"), + ("a/b.go", b"package a\n"), + ("README.md", b"# y\n"), + ]; + let zip_bytes = make_module_zip(prefix, &files); + let expected = spec_h1(&files, prefix); + assert_eq!( + go_h1_of_zip(&zip_bytes).unwrap(), + expected, + "production dirhash matches the spec mirror" + ); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/github.com/x/y/@v/v1.0.0.zip")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(zip_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "golang", + name: "github.com/x/y".into(), + version: "v1.0.0".into(), + purl: "pkg:golang/github.com/x/y@v1.0.0".into(), + resolved: Some(format!("{}/github.com/x/y/@v/v1.0.0.zip", mock.uri())), + integrity: LockIntegrity::GoH1(expected), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("go.mod").is_file()); + assert!(fetched.dir().join("a/b.go").is_file()); + + // Tampered h1 fails closed. + let entry = LockfileEntry { + integrity: LockIntegrity::GoH1( + "h1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=".into(), + ), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + #[cfg(feature = "golang")] + #[test] + fn go_escape_uppercase_and_zip_prefix_guards() { + assert_eq!( + go_escape("github.com/Azure/azure-sdk"), + "github.com/!azure/azure-sdk" + ); + assert_eq!(go_escape("v1.0.0-RC1"), "v1.0.0-!r!c1"); + + // An entry outside the module prefix fails the whole artifact. 
+ let zip_bytes = make_module_zip("github.com/x/y@v1.0.0/", &[("go.mod", b"m\n")]); + let tmp = tempfile::tempdir().unwrap(); + let err = + extract_zip_with_prefix(&zip_bytes, tmp.path(), "github.com/OTHER@v1/").unwrap_err(); + assert!(err.contains("outside"), "{err}"); + } + + /// Build a zip with the given `(path, bytes)` entries. + fn make_zip(files: &[(&str, &[u8])]) -> Vec { + use std::io::Write as _; + let mut writer = zip::ZipWriter::new(std::io::Cursor::new(Vec::new())); + for (name, bytes) in files { + writer + .start_file( + name.to_string(), + zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated), + ) + .unwrap(); + writer.write_all(bytes).unwrap(); + } + writer.finish().unwrap().into_inner() + } + + #[cfg(feature = "composer")] + #[tokio::test] + async fn composer_dist_fetch_verifies_sha1_and_strips_top_dir() { + // GitHub zipballs carry an `owner-repo-sha/` top dir. + let zip_bytes = make_zip(&[ + ( + "Seldaek-monolog-abc123/composer.json", + br#"{"name":"monolog/monolog"}"#, + ), + ("Seldaek-monolog-abc123/src/Logger.php", b" assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + #[tokio::test] + async fn gem_fetch_verifies_sha256_and_extracts_data_tar() { + // .gem = plain tar holding data.tar.gz (content at the ROOT — no + // prefix dir) + metadata.gz. 
+ let data_tgz = make_tgz(&[ + ("lib/rails.rb", b"module Rails; end\n", false), + ("README.md", b"# rails\n", false), + ]); + let mut outer = tar::Builder::new(Vec::new()); + for (name, bytes) in [ + ("metadata.gz", b"meta".as_slice()), + ("data.tar.gz", &data_tgz), + ] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + outer.append_data(&mut header, name, bytes).unwrap(); + } + let gem_bytes = outer.into_inner().unwrap(); + let sha = hex::encode(Sha256::digest(&gem_bytes)); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/downloads/rails-7.1.0.gem")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(gem_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "gem", + name: "rails".into(), + version: "7.1.0".into(), + purl: "pkg:gem/rails@7.1.0".into(), + resolved: Some(format!("{}/downloads/rails-7.1.0.gem", mock.uri())), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!( + fetched.dir().join("lib/rails.rb").is_file(), + "data.tar.gz content extracts at the root (no strip)" + ); + assert!(fetched.dir().join("README.md").is_file()); + } + + #[tokio::test] + async fn pypi_wheel_fetch_extracts_site_packages_layout() { + let wheel = make_zip(&[ + ("requests/__init__.py", b"__version__ = '2.28.0'\n"), + ( + "requests-2.28.0.dist-info/RECORD", + b"requests/__init__.py,sha256=abc,24\n", + ), + ("requests-2.28.0.dist-info/WHEEL", b"Wheel-Version: 1.0\n"), + ]); + let sha = hex::encode(Sha256::digest(&wheel)); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/packages/requests-2.28.0-py3-none-any.whl")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(wheel)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "pypi", + name: "requests".into(), + version: 
"2.28.0".into(), + purl: "pkg:pypi/requests@2.28.0".into(), + resolved: Some(format!( + "{}/packages/requests-2.28.0-py3-none-any.whl", + mock.uri() + )), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + // Wheel content at the root: a site-packages-shaped dir with the + // dist-info RECORD the pypi vendor backend stages from. + assert!(fetched.dir().join("requests/__init__.py").is_file()); + assert!(fetched + .dir() + .join("requests-2.28.0.dist-info/RECORD") + .is_file()); + + // No recorded wheel URL (poetry/requirements) → Unverifiable. + let entry = LockfileEntry { + resolved: None, + integrity: LockIntegrity::Sha256Hex("0".repeat(64)), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => assert!(msg.contains("wheel"), "{msg}"), + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[test] + fn oversized_entry_header_fails_closed() { + // A header CLAIMING more than the per-entry cap fails before any + // attempt to read that much data. + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + header.set_path("package/huge.bin").unwrap(); + header.set_size(MAX_ENTRY_BYTES + 1); + header.set_mode(0o644); + header.set_cksum(); + // Intentionally append no data: the size check fires first. 
+ let inner = { + use std::io::Write as _; + builder.get_mut().write_all(&header.as_bytes()[..]).unwrap(); + builder.into_inner().unwrap().finish().unwrap() + }; + let tmp = tempfile::tempdir().unwrap(); + let err = extract_tgz(&inner, tmp.path()).unwrap_err(); + assert!( + err.contains("cap") || err.contains("unreadable"), + "oversize header fails closed: {err}" + ); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/verify.rs b/crates/socket-patch-core/src/patch/vendor/verify.rs index 562eef6..45622d6 100644 --- a/crates/socket-patch-core/src/patch/vendor/verify.rs +++ b/crates/socket-patch-core/src/patch/vendor/verify.rs @@ -164,6 +164,101 @@ fn read_wheel_to_map(whl: &Path) -> Result>, String> { Ok(out) } +/// Hard cap on whole-artifact bytes hashed by the health check — committed +/// artifacts are small (a package tarball/wheel); a tampered multi-GiB file +/// must not stall `repair`. +const MAX_HEALTH_HASH_BYTES: u64 = 512 * 1024 * 1024; + +/// Classified health of one ledger entry's committed artifact, for +/// `repair`-style callers that need a DECISION (rebuild or not), not just a +/// routing tag. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ArtifactHealth { + /// Exists and every record file hashes to its afterHash (and, for + /// file-shaped artifacts, the whole file matches the ledger sha256). + Healthy, + /// Nothing at the artifact path: rebuildable. + Missing, + /// Present but failing verification: rebuildable. `reason` is the + /// stable routing tag (`vendor_hash_mismatch`, `file_not_found`, + /// `vendor_artifact_unreadable`, `vendor_sha256_mismatch`). + Corrupt { reason: String }, + /// The ledger/artifact uuid doesn't match the record: a re-vendor is + /// pending — not repair's job. + StaleUuid, + /// The entry can't be judged (poisoned path, empty record): fail + /// closed, never rebuild from it. 
+ Unverifiable { reason: String }, +} + +/// Health-check one vendored artifact against its patch record: the +/// per-file afterHash verification of [`verify_vendored_patch_record`] +/// plus, for file-shaped artifacts (`.tgz`/`.tar.gz`/`.whl`) with a +/// recorded ledger sha256, a whole-file hash cross-check — the rewired +/// lockfile integrity references those exact bytes, so silent drift breaks +/// the package manager even when the patched members still verify. +pub async fn check_vendored_artifact( + project_root: &Path, + entry: &VendorEntry, + record: &PatchRecord, +) -> ArtifactHealth { + match verify_vendored_patch_record(project_root, entry, record).await { + Err(tag) => match tag.as_str() { + "vendor_artifact_missing" => ArtifactHealth::Missing, + "vendor_uuid_mismatch" => ArtifactHealth::StaleUuid, + "vendor_hash_mismatch" | "file_not_found" | "vendor_artifact_unreadable" => { + ArtifactHealth::Corrupt { reason: tag } + } + _ => ArtifactHealth::Unverifiable { reason: tag }, + }, + Ok(()) => { + let norm = entry.artifact.path.replace('\\', "/"); + let file_shaped = + norm.ends_with(".tgz") || norm.ends_with(".tar.gz") || norm.ends_with(".whl"); + if !file_shaped || entry.artifact.sha256.is_empty() { + return ArtifactHealth::Healthy; + } + // The path already passed checked_artifact_path inside the + // verification above. + match file_sha256_hex(&project_root.join(&norm)).await { + Some(hex) if hex.eq_ignore_ascii_case(&entry.artifact.sha256) => { + ArtifactHealth::Healthy + } + Some(_) => ArtifactHealth::Corrupt { + reason: "vendor_sha256_mismatch".to_string(), + }, + None => ArtifactHealth::Corrupt { + reason: "vendor_artifact_unreadable".to_string(), + }, + } + } + } +} + +/// Plain sha256 hex of a regular file, size-capped; `None` on any read +/// failure or cap breach. Public for repair's ledger re-synthesis (the +/// rebuilt artifact's recorded sha). 
+pub async fn file_sha256_hex(path: &Path) -> Option { + use sha2::{Digest, Sha256}; + use tokio::io::AsyncReadExt; + + let meta = tokio::fs::metadata(path).await.ok()?; + if !meta.is_file() || meta.len() > MAX_HEALTH_HASH_BYTES { + return None; + } + let mut file = tokio::fs::File::open(path).await.ok()?; + let mut hasher = Sha256::new(); + let mut buf = vec![0u8; 64 * 1024]; + loop { + let n = file.read(&mut buf).await.ok()?; + if n == 0 { + break; + } + hasher.update(&buf[..n]); + } + Some(hex::encode(hasher.finalize())) +} + fn verify_member_map( members: &HashMap>, record: &PatchRecord, @@ -421,4 +516,89 @@ mod tests { "vendor_artifact_missing" ); } + + /// Full classification matrix for the repair-facing health check. + #[tokio::test] + async fn artifact_health_classification_matrix() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let rel = format!(".socket/vendor/npm/{UUID}/x-1.0.0.tgz"); + let rec = record(UUID, "package/index.js"); + + // Missing. + let ent = entry("npm", UUID, &rel); + assert_eq!( + check_vendored_artifact(root, &ent, &rec).await, + ArtifactHealth::Missing + ); + + // Healthy (no ledger sha recorded → member verification only). + tokio::fs::create_dir_all(root.join(format!(".socket/vendor/npm/{UUID}"))) + .await + .unwrap(); + write_tgz(&root.join(&rel), "package/index.js", PATCHED); + assert_eq!( + check_vendored_artifact(root, &ent, &rec).await, + ArtifactHealth::Healthy + ); + + // Healthy with a MATCHING ledger sha256. 
+ let tgz_bytes = tokio::fs::read(root.join(&rel)).await.unwrap(); + let mut ent_sha = entry("npm", UUID, &rel); + ent_sha.artifact.sha256 = { + use sha2::{Digest, Sha256}; + hex::encode(Sha256::digest(&tgz_bytes)) + }; + assert_eq!( + check_vendored_artifact(root, &ent_sha, &rec).await, + ArtifactHealth::Healthy + ); + + // Whole-file drift the member check can't see: members verify, but + // the bytes differ from what the lockfile integrity references + // (re-compressed archive → different sha). + ent_sha.artifact.sha256 = "0".repeat(64); + assert_eq!( + check_vendored_artifact(root, &ent_sha, &rec).await, + ArtifactHealth::Corrupt { + reason: "vendor_sha256_mismatch".to_string() + } + ); + + // Member tamper. + write_tgz(&root.join(&rel), "package/index.js", b"tampered"); + assert_eq!( + check_vendored_artifact(root, &ent, &rec).await, + ArtifactHealth::Corrupt { + reason: "vendor_hash_mismatch".to_string() + } + ); + + // Unreadable. + tokio::fs::write(root.join(&rel), b"\x1f\x8b00garbage") + .await + .unwrap(); + assert_eq!( + check_vendored_artifact(root, &ent, &rec).await, + ArtifactHealth::Corrupt { + reason: "vendor_artifact_unreadable".to_string() + } + ); + + // Stale uuid → not repair's job. + let rec_new = record("11111111-2222-4333-8444-555555555555", "package/index.js"); + assert_eq!( + check_vendored_artifact(root, &ent, &rec_new).await, + ArtifactHealth::StaleUuid + ); + + // Poisoned path → fail closed. 
+ let ent_bad = entry("npm", UUID, "../../outside.tgz"); + assert_eq!( + check_vendored_artifact(root, &ent_bad, &rec).await, + ArtifactHealth::Unverifiable { + reason: "vendor_path_unsafe".to_string() + } + ); + } } diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs index 613b503..2dd101b 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs @@ -83,7 +83,7 @@ pub async fn vendor_yarn_berry( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel.clone(); let base_purl = coords.base_purl.clone(); let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); @@ -195,6 +195,12 @@ pub async fn vendor_yarn_berry( format!("{PACKAGE_JSON} root is not an object"), ); }; + // A user-authored BARE-name pin to the exact version being vendored is + // TAKEN OVER (its value is rewritten to our spec — the pin already + // forced this exact version, so semantics are preserved — and recorded + // as the wiring `original` so revert restores it). Anything else + // same-name still refuses. + let mut takeover_original: Option = None; if let Some(res) = pkg_obj.get("resolutions") { let Some(res_obj) = res.as_object() else { return refused( @@ -210,19 +216,26 @@ pub async fn vendor_yarn_berry( continue; } // Our own (possibly stale-uuid) entry is fine to overwrite; a - // user-authored override is never clobbered. + // user-authored override is never clobbered silently. 
let ours = value .as_str() .is_some_and(|v| parse_vendor_path(v).is_some_and(|p| p.eco == "npm")); - if !ours { - return refused( - "vendor_override_conflict", - format!( - "{PACKAGE_JSON} already has a resolutions entry for `{selector}` \ - ({value}); vendor will not overwrite a user-authored override" - ), - ); + if ours { + continue; + } + if selector == name && value.as_str() == Some(version) { + takeover_original = Some(version.to_string()); + continue; } + return refused( + "vendor_override_conflict", + format!( + "{PACKAGE_JSON} already has a resolutions entry for `{selector}` \ + ({value}); vendor will not overwrite a user-authored override (an \ + exact-version pin `\"{name}\": \"{version}\"` is taken over \ + automatically)" + ), + ); } } @@ -254,6 +267,7 @@ pub async fn vendor_yarn_berry( sources, dry_run, force, + &mut warnings, ) .await { @@ -393,16 +407,17 @@ pub async fn vendor_yarn_berry( WiringRecord { file: PACKAGE_JSON.to_string(), kind: KIND_RESOLUTION.to_string(), - // Rewritten only when replacing our own stale entry — and then - // there is deliberately no `original` (never record our own edit - // as a pre-vendor fragment). + // Rewritten when replacing our own stale entry (no `original` — + // never record our own edit as a pre-vendor fragment) or a + // taken-over user pin (whose value IS the `original`, restored + // verbatim on revert). action: if existing_entry { WiringAction::Rewritten } else { WiringAction::Added }, key: Some(name.to_string()), - original: None, + original: takeover_original.map(Value::String), new: Some(Value::String(spec)), }, WiringRecord { @@ -688,6 +703,13 @@ fn revert_resolution_record( )); return; } + // A takeover recorded the user's pinned value: restore it in place + // (the key and table stay). Otherwise remove our entry as before. 
+ if let Some(orig) = rec.original.as_ref().and_then(Value::as_str) { + res_obj.insert(key.to_string(), Value::String(orig.to_string())); + *changed = true; + return; + } res_obj.shift_remove(key); if res_obj.is_empty() { obj.shift_remove("resolutions"); @@ -876,7 +898,7 @@ fn carried_sections(lines: &[String]) -> Vec { } /// Read a berry scalar field (`: `, value possibly quoted). -fn berry_field<'a>(lines: &'a [String], field: &str) -> Option<&'a str> { +pub(super) fn berry_field<'a>(lines: &'a [String], field: &str) -> Option<&'a str> { for line in lines.iter().skip(1) { let Some(rest) = body_field_line(line) else { continue; @@ -1356,6 +1378,62 @@ __metadata: assert_eq!(tokio::fs::read(fx.pkg_path()).await.unwrap(), fx.pkg_bytes); } + /// A user-authored BARE-name pin to the exact version being vendored is + /// taken over: the value moves to our spec, the wiring records the pin + /// as `original`, and revert restores it (table kept). Range-keyed + /// selectors keep refusing. + #[tokio::test] + async fn user_exact_pin_resolution_is_taken_over_and_revert_restores_it() { + let pkg_before = B3_BEFORE_PKG.replace( + " }\n}", + " },\n \"resolutions\": {\n \"left-pad\": \"1.3.0\"\n }\n}", + ); + let fx = fixture_with(&pkg_before, B3_BEFORE_LOCK).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + let pkg: Value = + serde_json::from_slice(&tokio::fs::read(fx.pkg_path()).await.unwrap()).unwrap(); + let val = pkg["resolutions"]["left-pad"].as_str().unwrap(); + assert!( + parse_vendor_path(val).is_some_and(|p| p.eco == "npm"), + "pin value rewritten to our spec: {val}" + ); + + let rec = entry + .wiring + .iter() + .find(|r| r.kind == KIND_RESOLUTION) + .unwrap(); + assert_eq!(rec.action, WiringAction::Rewritten); + assert_eq!( + rec.original, + Some(Value::String("1.3.0".to_string())), + "the user's pin is the original" + ); + + // Revert restores the pin in place 
(the resolutions table stays). + let outcome = revert_yarn_berry(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + let pkg: Value = + serde_json::from_slice(&tokio::fs::read(fx.pkg_path()).await.unwrap()).unwrap(); + assert_eq!( + pkg["resolutions"]["left-pad"], + Value::String("1.3.0".to_string()), + "pin restored" + ); + + // A range-keyed selector with the same value still refuses. + let pkg = B3_BEFORE_PKG.replace( + " }\n}", + " },\n \"resolutions\": {\n \"left-pad@npm:1.x\": \"1.3.0\"\n }\n}", + ); + let fx = fixture_with(&pkg, B3_BEFORE_LOCK).await; + expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + } + #[tokio::test] async fn missing_entry_and_other_version_guards() { // No left-pad entry at all. diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs index ac1ff84..6278bb8 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs @@ -68,7 +68,7 @@ pub async fn vendor_yarn_classic( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel; let base_purl = coords.base_purl; @@ -135,6 +135,7 @@ pub async fn vendor_yarn_classic( sources, dry_run, force, + &mut warnings, ) .await { @@ -554,7 +555,7 @@ fn rewrite_classic_block( /// Does this block's `resolved` already point into `.socket/vendor/npm/` /// (ours — current or stale uuid)? 
-fn block_points_into_vendor(lines: &[String]) -> bool { +pub(super) fn block_points_into_vendor(lines: &[String]) -> bool { classic_field(lines, "resolved") .and_then(parse_vendor_path) .is_some_and(|p| p.eco == "npm") diff --git a/crates/socket-patch-core/src/utils/purl.rs b/crates/socket-patch-core/src/utils/purl.rs index 6ea0e80..8cb2d24 100644 --- a/crates/socket-patch-core/src/utils/purl.rs +++ b/crates/socket-patch-core/src/utils/purl.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + /// Strip the trailing `?qualifiers` and `#subpath` components from a PURL, /// leaving the canonical `pkg:type/namespace/name@version` base. /// @@ -18,6 +20,94 @@ pub fn strip_purl_qualifiers(purl: &str) -> &str { } } +/// Strictly percent-decode ONE purl path component (a scope, namespace +/// segment, name, or version) AFTER it has been split out of the purl. +/// +/// The patches API serves purls in canonical percent-encoded form +/// (`pkg:npm/%40scope/name@1.0.0`), while crawlers build purls from the +/// literal on-disk names (`pkg:npm/@scope/name@1.0.0`). Parsers must +/// decode the API form to find installed packages. +/// +/// SECURITY: this must only ever be called on a component AFTER the purl +/// has been split on `/` and the version `@` — so an encoded separator +/// (`%2f`) cannot create new path segments at parse time; it surfaces as +/// a literal `/` *inside* one component — and BEFORE the path-safety +/// guards run, so `%2e%2e`, `%2f`, `%5c`, `%00` are rejected post-decode +/// by the same `is_safe_*` gates that reject their literal forms. +/// Guarding the encoded form instead would be a traversal bypass. +/// +/// Decoding is all-or-nothing: an invalid escape (`%G1`, trailing `%4`) +/// or a non-UTF8 decode returns the input unchanged (fail-safe — the +/// undecoded form contains no separators, and `%` is not a legal +/// character in any real package name). Zero-alloc when no `%`. 
+pub fn percent_decode_purl_component(component: &str) -> Cow<'_, str> { + if !component.contains('%') { + return Cow::Borrowed(component); + } + fn hex_val(b: u8) -> Option { + match b { + b'0'..=b'9' => Some(b - b'0'), + b'a'..=b'f' => Some(b - b'a' + 10), + b'A'..=b'F' => Some(b - b'A' + 10), + _ => None, + } + } + let bytes = component.as_bytes(); + let mut out: Vec = Vec::with_capacity(bytes.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'%' { + let (Some(hi), Some(lo)) = ( + bytes.get(i + 1).copied().and_then(hex_val), + bytes.get(i + 2).copied().and_then(hex_val), + ) else { + // Invalid escape: leave the whole component verbatim. + return Cow::Borrowed(component); + }; + out.push(hi * 16 + lo); + i += 3; + } else { + out.push(bytes[i]); + i += 1; + } + } + match String::from_utf8(out) { + Ok(s) => Cow::Owned(s), + // Decoded bytes are not UTF-8: leave the component verbatim. + Err(_) => Cow::Borrowed(component), + } +} + +/// Canonical string form for purl-to-purl comparison and display: +/// percent-decode each `/`-separated component of the +/// `pkg:type/...@version` base; qualifiers/subpath are appended verbatim. +/// +/// Used ONLY for string equality (`purl_eq`) and human output — never to +/// build filesystem paths (a `%2f` decoding into a name can at worst make +/// two distinct purls compare equal, not change a write location). +pub fn normalize_purl(purl: &str) -> Cow<'_, str> { + if !purl.contains('%') { + return Cow::Borrowed(purl); + } + let split = purl.find(['?', '#']).unwrap_or(purl.len()); + let (base, suffix) = purl.split_at(split); + let mut out = String::with_capacity(purl.len()); + for (i, seg) in base.split('/').enumerate() { + if i > 0 { + out.push('/'); + } + out.push_str(&percent_decode_purl_component(seg)); + } + out.push_str(suffix); + Cow::Owned(out) +} + +/// Purl equality up to percent-encoding of the base components +/// (`pkg:npm/%40scope/x@1` ≡ `pkg:npm/@scope/x@1`). 
+pub fn purl_eq(a: &str, b: &str) -> bool { + normalize_purl(a) == normalize_purl(b) +} + /// Parse a PyPI PURL to extract name and version. /// /// e.g., `"pkg:pypi/requests@2.28.0?artifact_id=abc"` -> `Some(("requests", "2.28.0"))` @@ -154,8 +244,12 @@ pub fn build_composer_purl(namespace: &str, name: &str, version: &str) -> String /// We follow the same shape as `parse_composer_purl` since both /// have a `/` namespace structure. The leading `@` on /// the scope is preserved (matching npm's `@scope/name` convention). +/// `((scope, name), version)` from a JSR purl, percent-decoded. +#[cfg(feature = "deno")] +pub type JsrPurlParts<'a> = ((Cow<'a, str>, Cow<'a, str>), Cow<'a, str>); + #[cfg(feature = "deno")] -pub fn parse_jsr_purl(purl: &str) -> Option<((&str, &str), &str)> { +pub fn parse_jsr_purl(purl: &str) -> Option> { let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:jsr/")?; let at_idx = rest.rfind('@')?; @@ -167,8 +261,12 @@ pub fn parse_jsr_purl(purl: &str) -> Option<((&str, &str), &str)> { } let slash_idx = name_part.find('/')?; - let scope = &name_part[..slash_idx]; - let name = &name_part[slash_idx + 1..]; + // Decode AFTER splitting on `/`/`@` and BEFORE the shape checks below + // (and the caller's `is_safe_jsr_component` gate) — see + // `percent_decode_purl_component`. The API serves `%40scope`. + let scope = percent_decode_purl_component(&name_part[..slash_idx]); + let name = percent_decode_purl_component(&name_part[slash_idx + 1..]); + let version = percent_decode_purl_component(version); // Scope must be `@`. The bare `@` (length 1) is // invalid — there's no actual scope after the marker. @@ -248,15 +346,22 @@ pub fn is_purl(s: &str) -> bool { /// /// Non-PyPI keys never carry a `?`, so for them this reduces to plain /// equality. 
+/// +/// Comparison is encoding-tolerant (`purl_eq`): manifest keys come from +/// the API in percent-encoded form (`pkg:npm/%40scope/x@1`) while users +/// type the literal form — both spellings must match either way around. pub fn purl_matches_identifier(manifest_key: &str, identifier: &str) -> bool { if identifier.contains('?') { - manifest_key == identifier + purl_eq(manifest_key, identifier) } else { // Base identifier: compare bases. Strip both sides so a subpath // (`#...`) carried by either the key or the identifier doesn't // defeat the match — `strip_purl_qualifiers(identifier)` is a no-op // for a plain base PURL, so existing behaviour is unchanged. - strip_purl_qualifiers(manifest_key) == strip_purl_qualifiers(identifier) + purl_eq( + strip_purl_qualifiers(manifest_key), + strip_purl_qualifiers(identifier), + ) } } @@ -504,25 +609,30 @@ mod tests { ); } + #[cfg(feature = "deno")] + fn jsr_parts(purl: &str) -> Option<(String, String, String)> { + parse_jsr_purl(purl).map(|((s, n), v)| (s.into_owned(), n.into_owned(), v.into_owned())) + } + #[cfg(feature = "deno")] #[test] fn test_parse_jsr_purl() { assert_eq!( - parse_jsr_purl("pkg:jsr/@std/path@0.220.0"), - Some((("@std", "path"), "0.220.0")) + jsr_parts("pkg:jsr/@std/path@0.220.0"), + Some(("@std".into(), "path".into(), "0.220.0".into())) ); assert_eq!( - parse_jsr_purl("pkg:jsr/@luca/flag@1.0.0"), - Some((("@luca", "flag"), "1.0.0")) + jsr_parts("pkg:jsr/@luca/flag@1.0.0"), + Some(("@luca".into(), "flag".into(), "1.0.0".into())) ); // Scope must start with `@`. - assert_eq!(parse_jsr_purl("pkg:jsr/std/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/std/path@0.220.0"), None); // Empty pieces. 
- assert_eq!(parse_jsr_purl("pkg:jsr/@/path@0.220.0"), None); - assert_eq!(parse_jsr_purl("pkg:jsr/@std/@0.220.0"), None); - assert_eq!(parse_jsr_purl("pkg:jsr/@std/path@"), None); + assert_eq!(jsr_parts("pkg:jsr/@/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/@std/@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/@std/path@"), None); // Wrong scheme. - assert_eq!(parse_jsr_purl("pkg:npm/@std/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:npm/@std/path@0.220.0"), None); } #[cfg(feature = "deno")] @@ -661,8 +771,8 @@ mod tests { // Scope `@` + version `@` + qualifier `@` all coexist; only the // version `@` should be honored. assert_eq!( - parse_jsr_purl("pkg:jsr/@std/path@0.220.0?download_url=x@y"), - Some((("@std", "path"), "0.220.0")) + jsr_parts("pkg:jsr/@std/path@0.220.0?download_url=x@y"), + Some(("@std".into(), "path".into(), "0.220.0".into())) ); } @@ -748,6 +858,91 @@ mod tests { )); } + // --- Percent-decoding: API purls carry %-encoded components -------------- + + #[test] + fn test_percent_decode_purl_component() { + // The canonical case: an encoded npm scope marker. + assert_eq!( + percent_decode_purl_component("%40modelcontextprotocol"), + "@modelcontextprotocol" + ); + // Traversal sequences decode — the post-decode safety guards are + // what reject them, not this helper. + assert_eq!(percent_decode_purl_component("%2e%2e"), ".."); + assert_eq!(percent_decode_purl_component("a%2fb"), "a/b"); + assert_eq!(percent_decode_purl_component("%00"), "\0"); + // Invalid escapes leave the WHOLE component verbatim (all-or-nothing). + assert_eq!(percent_decode_purl_component("%G1abc"), "%G1abc"); + assert_eq!(percent_decode_purl_component("abc%4"), "abc%4"); + assert_eq!(percent_decode_purl_component("abc%"), "abc%"); + // Non-UTF8 decode (lone continuation byte) leaves it verbatim. + assert_eq!(percent_decode_purl_component("%FF"), "%FF"); + // No '%' is zero-alloc (borrowed). 
+ assert!(matches!( + percent_decode_purl_component("plain-name"), + Cow::Borrowed(_) + )); + } + + #[test] + fn test_normalize_purl_and_purl_eq() { + assert_eq!( + normalize_purl("pkg:npm/%40modelcontextprotocol/sdk@1.12.0"), + "pkg:npm/@modelcontextprotocol/sdk@1.12.0" + ); + assert!(purl_eq( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/x@1.0.0" + )); + assert!(purl_eq( + "pkg:npm/@scope/x@1.0.0", + "pkg:npm/%40scope/x@1.0.0" + )); + assert!(!purl_eq( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/x@2.0.0" + )); + // Qualifiers/subpath are preserved verbatim (not decoded). + assert_eq!( + normalize_purl("pkg:npm/%40s/x@1?artifact_id=a%2Fb"), + "pkg:npm/@s/x@1?artifact_id=a%2Fb" + ); + // Unencoded input is unchanged (and borrowed). + assert!(matches!( + normalize_purl("pkg:npm/lodash@4.17.21"), + Cow::Borrowed(_) + )); + } + + #[test] + fn test_purl_matches_identifier_decodes_encoded_key() { + // Encoded manifest key vs literal identifier — and vice versa. + assert!(purl_matches_identifier( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/x@1.0.0" + )); + assert!(purl_matches_identifier( + "pkg:npm/@scope/x@1.0.0", + "pkg:npm/%40scope/x@1.0.0" + )); + assert!(!purl_matches_identifier( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/y@1.0.0" + )); + } + + #[cfg(feature = "deno")] + #[test] + fn test_parse_jsr_purl_percent_encoded_scope() { + let ((scope, name), version) = parse_jsr_purl("pkg:jsr/%40std/path@0.220.0").unwrap(); + assert_eq!(scope, "@std"); + assert_eq!(name, "path"); + assert_eq!(version, "0.220.0"); + // The encoded bare `@` is still rejected post-decode. 
+ assert_eq!(jsr_parts("pkg:jsr/%40/path@0.220.0"), None); + } + // --- Regression: name must not absorb the version separator ------------- #[test] diff --git a/crates/socket-patch-core/tests/blob_fetcher_edges_e2e.rs b/crates/socket-patch-core/tests/blob_fetcher_edges_e2e.rs index 5dcdf8f..761e56e 100644 --- a/crates/socket-patch-core/tests/blob_fetcher_edges_e2e.rs +++ b/crates/socket-patch-core/tests/blob_fetcher_edges_e2e.rs @@ -184,6 +184,7 @@ async fn fetch_missing_sources_package_mode_with_no_packages_path() { blobs_path: &blobs, packages_path: None, diffs_path: None, + mem_blobs: None, }; // Non-empty manifest: there IS work to do. So `total == 0` below can // only mean the None-packages_path branch short-circuited — not that @@ -226,6 +227,7 @@ async fn fetch_missing_sources_diff_mode_with_no_diffs_path() { blobs_path: &blobs, packages_path: None, diffs_path: None, + mem_blobs: None, }; let manifest = manifest_with_after_hashes(&[&"a".repeat(64)]); let client = dummy_client(); @@ -632,6 +634,7 @@ async fn fetch_missing_sources_diff_downloads_and_writes_archive() { blobs_path: &blobs, packages_path: None, diffs_path: Some(&diffs), + mem_blobs: None, }; let manifest = manifest_with_uuids(&[uuid]); let client = proxy_client(&server.uri()); @@ -688,6 +691,7 @@ async fn fetch_missing_sources_package_downloads_via_package_endpoint() { blobs_path: &blobs, packages_path: Some(&packages), diffs_path: None, + mem_blobs: None, }; let manifest = manifest_with_uuids(&[uuid]); let client = proxy_client(&server.uri()); @@ -723,6 +727,7 @@ async fn fetch_missing_sources_diff_404_is_failure_with_kind_message() { blobs_path: &blobs, packages_path: None, diffs_path: Some(&diffs), + mem_blobs: None, }; let manifest = manifest_with_uuids(&[uuid]); let client = proxy_client(&server.uri()); @@ -765,6 +770,7 @@ async fn fetch_missing_sources_diff_invokes_progress_callback() { blobs_path: &blobs, packages_path: None, diffs_path: Some(&diffs), + mem_blobs: None, }; let manifest = 
manifest_with_uuids(&[uuid]); let client = proxy_client(&server.uri());