diff --git a/Cargo.lock b/Cargo.lock index ab20718..74f50ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2420,6 +2420,7 @@ dependencies = [ "base64", "clap", "dialoguer", + "flate2", "fs2", "hex", "indicatif", @@ -2431,6 +2432,7 @@ dependencies = [ "serial_test", "sha2", "socket-patch-core", + "tar", "tempfile", "testcontainers", "tokio", diff --git a/README.md b/README.md index 72809b5..19fec6a 100644 --- a/README.md +++ b/README.md @@ -233,7 +233,10 @@ socket-patch scan -g # Scan + apply + emit an OpenVEX attestation in one pass socket-patch scan --json --sync --yes --vex socket.vex.json -# Vendor every patched dependency (committable; see the vendor command) +# Vendor every patched dependency (committable; see the vendor command). +# Works on a completely fresh clone: dependencies listed in the lockfile +# but not yet installed are fetched pristine from their registry and +# integrity-verified against the lockfile before vendoring. socket-patch scan --json --vendor --yes # Same, but keep the manifest out of it entirely diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md index 7fb1d59..14bc18e 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -55,7 +55,8 @@ Beyond the globals above, each subcommand defines a small set of local arguments | Subcommand | Local arg | Env var | Purpose | |---|---|---|---| | `apply` | `--force` / `-f` | `SOCKET_FORCE` | Bypass beforeHash check | -| `vendor` | `--force` / `-f` | `SOCKET_FORCE` | Bypass beforeHash check when staging the vendored copy | +| `vendor` | `--force` / `-f` | `SOCKET_FORCE` | Tolerate missing patch-target files in the stage + bypass the variant probe. 
A beforeHash mismatch no longer needs it: vendor staging auto-overwrites with the verified patched content (`vendor_content_mismatch_overwritten` warning) | +| (global) | `--strict` | `SOCKET_STRICT` | Treat a beforeHash mismatch as a hard error in the in-place apply paths (apply/get/scan --apply/hook/go redirect). DEFAULT (v3.4): a mismatched file is overwritten with the FULL verified patched content (the diff strategy self-disables on a wrong base; archive/blob writes are hash-gated to exactly afterHash; the missing blob is downloaded on demand) and surfaced as a `content_mismatch_overwritten` stderr warning + Skipped event. `--force` overrides `--strict` and additionally skips missing files. Vendor staging is unaffected (it always auto-overwrites into its private stage). | | `vendor` | `--revert` | `SOCKET_VENDOR_REVERT` | Undo vendoring: restore recorded original lockfile fragments + remove `.socket/vendor/` artifacts. Works without a manifest | | `apply`, `scan`, `vendor` | `--vex` | `SOCKET_VEX` | Generate an OpenVEX 0.2.0 document at this path on a successful run; see "embedded VEX" below | | `apply`, `scan`, `vendor` | `--vex-product`, `--vex-no-verify`, `--vex-doc-id`, `--vex-compact` | `SOCKET_VEX_PRODUCT`, `SOCKET_VEX_NO_VERIFY`, `SOCKET_VEX_DOC_ID`, `SOCKET_VEX_COMPACT` | Passthrough to the embedded VEX builder; mirror the standalone `vex` knobs. Inert unless `--vex` is set | @@ -71,10 +72,14 @@ Beyond the globals above, each subcommand defines a small set of local arguments `scan --apply` opts JSON callers into the full discover → select → apply pipeline. Without it, `scan --json` stays read-only (discovery + `updates` array only). No effect outside `--json` mode — the non-JSON path always prompts the user interactively. -`scan --prune` opts into garbage collection. When set, `scan` removes manifest entries for packages no longer present in the crawl, then deletes orphan blob, diff, and package-archive files from `.socket/`. 
Off by default (v3.0) so a temporary uninstall doesn't silently destroy manifest state. +`scan --prune` opts into garbage collection. When set, `scan` removes manifest entries for packages no longer present in the crawl, then deletes orphan blob, diff, and package-archive files from `.socket/`. Off by default (v3.0) so a temporary uninstall doesn't silently destroy manifest state. The pass also reconciles vendored state (runs FIRST, under the apply lock — lock contention skips it without failing the scan): vendored entries whose patch is gone from the manifest are reverted, vendored entries whose dependency is no longer in the lockfile graph are reverted AND their manifest entries dropped (detached entries are exempt from both — they are manifest- and lockfile-invisible by design; a missing or undeterminable lockfile keeps the entry, fail-safe), and orphan `.socket/vendor//` dirs with no ledger entry are swept. The JSON `gc` sub-object gains `revertedVendoredEntries` + `removedVendorOrphanDirs` (wet) / `revertableVendoredEntries` + `vendorOrphanDirs` (preview). `scan` queries the patch API in `--batch-size` chunks. Authenticated runs POST `/v0/orgs/{slug}/patches/batch`; token-less runs POST `{proxy}/patch/batch` on the public proxy and degrade to per-package `GET /patch/by-package/:purl` requests in two cases: the deployed proxy predates the batch endpoint (legacy proxies answer the POST with their `400 "Unsupported endpoint"` catch-all), or the all-or-nothing batch validation rejects the chunk (e.g. a crawled PURL type the server doesn't recognize, such as `pkg:jsr/…` — the per-package path tolerates those individually, preserving the pre-batch scan semantics). Rate limits and over-capacity 503s surface instead of silently degrading. +**Lockfile supplement (v3.4)**: `scan` discovery is no longer limited to installed trees. 
The project's lockfiles (`package-lock.json`/`npm-shrinkwrap.json`, `pnpm-lock.yaml` v9, `yarn.lock` classic + berry, `bun.lock`, `Cargo.lock`, `go.sum`, `composer.lock`, `Gemfile.lock`, `uv.lock`/`poetry.lock`/pinned `requirements.txt`) are inventoried and dependencies with NO installed copy join discovery — counts, the API lookup, the table (flagged ` [NOT INSTALLED]`, plus a stderr note), and the prune "scanned" set (a wiped node_modules no longer prunes lockfile-listed entries). JSON gains a top-level `lockfileOnlyPackages` count and an additive `notInstalled: true` on matching `packages[]` entries. `--apply` partitions lockfile-only patches out BEFORE download (calm `skipped`/`package_not_installed` records — never an error exit, never a manifest write); `--vendor` passes them through to the vendor engine's auto-fetch. Vendored-ledger entries likewise stay discoverable on a fresh clone (the committed artifact is the dependency). Global scans (`--global`) get no supplement. + +**Vendor auto-fetch (v3.4)**: `vendor`/`scan --vendor` no longer fail on lockfile-resolved packages with no installed copy. Already-vendored purls stage from their committed artifact (sha256-verified against the vendor ledger; offline-safe). Otherwise the pristine artifact is fetched per the lockfile resolution and verified against the lock's recorded integrity FAIL-CLOSED before any write: npm SRI (or yarn classic's sha1 fragment), yarn berry's cache-zip checksum (rebuilt from the fetched tarball; cacheKey 10c0 only), Cargo.lock sha256 over the .crate, go.sum `h1:` dirhash over the module zip, composer `dist.shasum` (sha1), Gemfile.lock `CHECKSUMS` sha256, uv.lock wheel sha256 (pure `py3-none-any` wheels only). Entries the lock cannot verify are NEVER fetched (`vendor_fetch_unverifiable` warning + the calm `package_not_installed` skip). 
Registry bases honor `SOCKET_NPM_REGISTRY`, `SOCKET_CRATES_REGISTRY`, `SOCKET_GOPROXY` (else `GOPROXY`); npm/yarn/composer/gem/uv lock-recorded URLs are used verbatim. `--offline` refuses the fetch with the calm skip (the detail names the lockfile resolution). The fetch stages into a private tempdir — the project tree is never touched. + `scan --sync` is sugar for `--apply --prune` — the canonical single-flag bot invocation. `scan --json --sync --yes` discovers, applies, and reconciles state in one pass. `scan --vendor` swaps the in-place apply for the vendor pipeline: discover → download (manifest written, as `--apply`) → vendor every patched dependency via the same engine as the `vendor` command (under the same lock). The whole manifest is vendored, so a package vendored at an older patch uuid is **re-vendored automatically** (its old uuid dir is removed — `vendor_stale_artifact_removed`); same-uuid re-runs are `already_vendored` skips. With `--prune`, GC runs **before** the vendor step so stale manifest entries don't fail vendoring with `package_not_installed`. JSON output gains a `download` sub-object (the download phase; no `applied` field — nothing is applied in place) and a `vendor` sub-object (a full vendor Envelope). `--dry-run` previews per-patch `would_vendor` | `would_revendor` (+`oldUuid`) | `already_vendored` without network downloads or disk writes. Interactive mode prompts "Download and vendor N patch(es)?". @@ -442,7 +447,8 @@ worse, lets a warm cache silently serve unpatched bytes): moved past the vendored uuid (that would break VEX verification with `vendor_uuid_mismatch` until a vendor run). The skip rides `apply.patches[]` as `skipped`/`vendored`; a newer available patch still surfaces in `updates[]` — the signal to run `scan --vendor`. `scan --prune` exempts - vendored purls (an absent installed copy is their NORMAL state, not grounds to prune). 
An + vendored purls from the crawl-based manifest prune (an absent installed copy is their NORMAL + state) but reconciles vendored state via the lockfile instead — see the `--prune` section. An explicit `get` is allowed to move the manifest past the vendored uuid and warns (`warnings[]` + stderr) that a `vendor` run must refresh the artifact. * **Old-binary skew caveat**: a pre-detached `socket-patch` binary running `vendor` against a @@ -602,6 +608,11 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `vendor_yarn_berry_cache_unsupported` | `failed` | vendor (yarn berry): lock `cacheKey ≠ 10c0` or non-default `.yarnrc.yml` `compressionLevel` — the cache-zip checksum is not reproducible. | | `vendor_override_conflict` | `failed` | vendor (pnpm/yarn-berry): a user-authored override/resolution for the package already exists. | | `vendor_integrity_unverified` | `skipped` (warning) | vendor (pipenv): the lockfile format does not hash-check file entries; the committed wheel bytes are the protection. | +| `vendor_content_mismatch_overwritten` | `skipped` (warning) | vendor: a staged file matched NEITHER beforeHash nor afterHash (patch built against different bytes, or local edits); the stage was overwritten with the verified patched content and the vendor succeeded. | +| `vendor_fetched_missing` | `skipped` (warning) | vendor: the package was not installed; its pristine artifact was fetched per the lockfile resolution (or staged from the committed vendor artifact), integrity-verified, and vendored — the project tree was not touched. | +| `vendor_fetch_failed` | `failed` | vendor: the lockfile-resolved fetch was attempted and failed (HTTP error, size cap, integrity mismatch, or a corrupt committed artifact). Suppresses the duplicate `package_not_installed` skip. 
| +| `vendor_fetch_unverifiable` | `skipped` (warning) | vendor: the lockfile records no usable integrity for the missing package; nothing was fetched (fail-closed) and the `package_not_installed` skip follows. | +| `content_mismatch_overwritten` | `skipped` (warning) | apply (default policy): a file matched NEITHER beforeHash nor afterHash and was overwritten with the full verified patched content. `--strict` turns this case into a `failed` event instead. | | `vendor_lock_checksums_unsupported` / `vendor_stale_lock_checksum` | `failed` | vendor (gem): an ambiguous/platform CHECKSUMS entry, or a v1-wired lock whose stale token blocks the hot path (run `vendor --revert` + re-vendor). | | `pypi_{poetry,pdm,pipenv}_no_lockfile` | `failed` | vendor (pypi): a lock-less tool marker with no `requirements.txt` fallback — run ` lock`. | | `vendor_*` / `pypi_*` / `gemfile_*` / `lock_*` / `locked_version_mismatch` / `user_authored_*` / `native_extensions_unsupported` / `platform_gem_unsupported` | `failed`/`skipped` | vendor: per-ecosystem refusal + drift vocabulary; see the Vendor command contract section. New tags are additive (MINOR). | diff --git a/crates/socket-patch-cli/Cargo.toml b/crates/socket-patch-cli/Cargo.toml index cedba95..e983539 100644 --- a/crates/socket-patch-cli/Cargo.toml +++ b/crates/socket-patch-cli/Cargo.toml @@ -59,6 +59,9 @@ setup-e2e = [] [dev-dependencies] sha2 = { workspace = true } +# scan_vendor_e2e builds pristine registry tarballs for the auto-fetch tests. +tar = { workspace = true } +flate2 = { workspace = true } hex = { workspace = true } wiremock = { workspace = true } portable-pty = { workspace = true } diff --git a/crates/socket-patch-cli/src/args.rs b/crates/socket-patch-cli/src/args.rs index 784b0fe..1fde519 100644 --- a/crates/socket-patch-cli/src/args.rs +++ b/crates/socket-patch-cli/src/args.rs @@ -144,6 +144,19 @@ pub struct GlobalArgs { )] pub offline: bool, + /// Treat a beforeHash mismatch as a hard error. 
By DEFAULT a file whose + /// on-disk content matches neither the patch's beforeHash nor its + /// afterHash is overwritten with the full verified patched content and + /// surfaced as a stderr warning (`content_mismatch_overwritten`); this + /// flag restores the fail-closed behavior. `--force` overrides it. + #[arg( + long, + env = "SOCKET_STRICT", + default_value_t = false, + value_parser = parse_bool_flag, + )] + pub strict: bool, + /// Operate on globally-installed packages. #[arg( long = "global", @@ -378,6 +391,7 @@ impl Default for GlobalArgs { ecosystems: None, download_mode: "diff".to_string(), offline: false, + strict: false, global: false, global_prefix: None, json: false, diff --git a/crates/socket-patch-cli/src/commands/apply.rs b/crates/socket-patch-cli/src/commands/apply.rs index 07d11db..e2cd0db 100644 --- a/crates/socket-patch-cli/src/commands/apply.rs +++ b/crates/socket-patch-cli/src/commands/apply.rs @@ -5,9 +5,115 @@ use socket_patch_core::crawlers::{ }; use socket_patch_core::manifest::operations::read_manifest; use socket_patch_core::manifest::schema::PatchRecord; -use socket_patch_core::patch::apply::{ +use socket_patch_core::patch::apply::{MismatchPolicy, apply_package_patch, verify_file_patch, ApplyResult, PatchSources, VerifyStatus, }; +/// Files whose pre-apply content matched NEITHER hash and were (or would +/// be) overwritten with the verified patched content — the promoted +/// verify signature `apply_package_patch` leaves behind under the default +/// mismatch policy. +pub(crate) fn mismatch_overwritten_files(result: &ApplyResult) -> Vec { + result + .files_verified + .iter() + .filter(|v| { + v.status == VerifyStatus::Ready + && v.expected_hash.is_some() + && v.current_hash != v.expected_hash + }) + .map(|v| v.file.clone()) + .collect() +} + +/// Surface one mismatch-overwrite per file on stderr (human mode). 
+fn warn_mismatch_overwrites(result: &ApplyResult, common: &GlobalArgs) { + if common.json || common.silent { + return; + } + for file in mismatch_overwritten_files(result) { + eprintln!( + "Warning (content_mismatch_overwritten): {} {file} did not match the patch's \ + expected original content; applied the full verified patched content instead \ + (pass --strict to fail on mismatches)", + socket_patch_core::utils::purl::normalize_purl(&result.package_key) + ); + } +} + +/// The default mismatch policy applies the FULL patched content for +/// mismatched files — and the full content lives in the afterHash blob, +/// which the default `--download-mode diff` may not have staged. Probe the +/// in-scope packages for mismatches and fetch the missing afterHash blobs +/// by hash (online only) so the apply below can fall through diff → blob. +async fn ensure_blobs_for_mismatches( + args: &ApplyArgs, + manifest: &socket_patch_core::manifest::schema::PatchManifest, + all_packages: &HashMap, + blobs_path: &Path, +) { + if args.common.strict && !args.force { + return; // strict fails on mismatch — nothing to fetch + } + let mut needed: std::collections::HashSet = std::collections::HashSet::new(); + for (purl, pkg_path) in all_packages { + let Some(record) = manifest.patches.get(purl) else { + continue; + }; + for (file_name, info) in &record.files { + if info.before_hash.is_empty() { + continue; + } + let verify = verify_file_patch(pkg_path, file_name, info).await; + if verify.status == socket_patch_core::patch::apply::VerifyStatus::HashMismatch + && tokio::fs::metadata(blobs_path.join(&info.after_hash)) + .await + .is_err() + { + needed.insert(info.after_hash.clone()); + } + } + } + if needed.is_empty() { + return; + } + if args.common.offline { + if !args.common.silent && !args.common.json { + eprintln!( + "Warning: {} mismatched file(s) need their full patched blob, but --offline \ + prevents fetching; those files will fail to apply", + needed.len() + ); + } + return; + 
} + if !args.common.silent && !args.common.json { + eprintln!( + "Downloading {} full patched blob(s) for mismatched file(s)...", + needed.len() + ); + } + let (client, _) = get_api_client_with_overrides(args.common.api_client_overrides()).await; + let _ = socket_patch_core::api::blob_fetcher::fetch_blobs_by_hash( + &needed, + blobs_path, + &client, + None, + ) + .await; +} + +/// The mismatch policy this run applies with: `--force` ⊃ default +/// (adds the missing-file skip), `--strict` restores fail-closed. +pub(crate) fn mismatch_policy(force: bool, strict: bool) -> MismatchPolicy { + if force { + MismatchPolicy::Force + } else if strict { + MismatchPolicy::Strict + } else { + MismatchPolicy::Warn + } +} + #[cfg(feature = "golang")] use socket_patch_core::patch::go_redirect::{ apply_go_redirect, reconcile_go_redirects, verify_go_redirect_state, @@ -102,7 +208,7 @@ async fn try_local_go_apply( patch: &PatchRecord, sources: &PatchSources<'_>, common: &GlobalArgs, - force: bool, + policy: MismatchPolicy, ) -> Option { if !is_local_go(purl, common) { return None; @@ -126,7 +232,7 @@ async fn try_local_go_apply( sources, Some(&patch.uuid), common.dry_run, - force, + policy, ) .await, ) @@ -139,7 +245,7 @@ async fn try_local_go_apply( _patch: &PatchRecord, _sources: &PatchSources<'_>, _common: &GlobalArgs, - _force: bool, + _policy: MismatchPolicy, ) -> Option { None } @@ -538,6 +644,21 @@ pub async fn run(args: ApplyArgs) -> i32 { } for result in &results { env.record(result_to_event(result, args.common.dry_run)); + // Mismatch overwrites ride as Skipped warning events + // (same pattern as the vendor warnings): the package's + // Applied event stands, the warning is per-file. 
+ for file in mismatch_overwritten_files(result) { + env.record( + PatchEvent::new(PatchAction::Skipped, result.package_key.clone()) + .with_reason( + "content_mismatch_overwritten", + format!( + "{file} did not match the patch's expected original \ + content; the full verified patched content was applied" + ), + ), + ); + } // Sidecar records live on the envelope, not on // individual events. Consumers iterate // `envelope.sidecars[]` and JOIN against @@ -609,9 +730,20 @@ pub async fn run(args: ApplyArgs) -> i32 { } else { format!(" (via {})", tags.join("+")) }; - println!(" {}{}", result.package_key, suffix); + println!( + " {}{}", + socket_patch_core::utils::purl::normalize_purl( + &result.package_key + ), + suffix + ); } else if all_files_already_patched(result) { - println!(" {} (already patched)", result.package_key); + println!( + " {} (already patched)", + socket_patch_core::utils::purl::normalize_purl( + &result.package_key + ) + ); } } } @@ -888,6 +1020,7 @@ async fn apply_patches_inner( } // Apply patches + ensure_blobs_for_mismatches(args, &manifest, &all_packages, &blobs_path).await; let mut has_errors = false; // Group release-variant PURLs by base. PyPI (`?artifact_id=`), @@ -977,10 +1110,11 @@ async fn apply_patches_inner( &sources, Some(&patch.uuid), args.common.dry_run, - args.force, + mismatch_policy(args.force, args.common.strict), ) .await; + warn_mismatch_overwrites(&result, &args.common); // A variant that reached apply is the installed distribution // (it passed the first-file check, or `--force` bypassed it), // so record it as matched whether or not the patch succeeded. @@ -1060,7 +1194,14 @@ async fn apply_patches_inner( // cache) — patches in place via `apply_package_patch`. Without the // `golang` feature `try_local_go_apply` is an inert `None`. 
let result = - match try_local_go_apply(purl, pkg_path, patch, &sources, &args.common, args.force) + match try_local_go_apply( + purl, + pkg_path, + patch, + &sources, + &args.common, + mismatch_policy(args.force, args.common.strict), + ) .await { Some(r) => r, @@ -1072,12 +1213,13 @@ async fn apply_patches_inner( &sources, Some(&patch.uuid), args.common.dry_run, - args.force, + mismatch_policy(args.force, args.common.strict), ) .await } }; + warn_mismatch_overwrites(&result, &args.common); if !result.success { has_errors = true; if !args.common.silent && !args.common.json { @@ -1111,7 +1253,10 @@ async fn apply_patches_inner( unmatched.len() ); for purl in &unmatched { - eprintln!(" - {}", purl); + eprintln!( + " - {}", + socket_patch_core::utils::purl::normalize_purl(purl) + ); } } diff --git a/crates/socket-patch-cli/src/commands/get.rs b/crates/socket-patch-cli/src/commands/get.rs index e7359aa..b6f1271 100644 --- a/crates/socket-patch-cli/src/commands/get.rs +++ b/crates/socket-patch-cli/src/commands/get.rs @@ -13,7 +13,7 @@ use socket_patch_core::manifest::schema::{ }; use socket_patch_core::patch::apply::select_installed_variants; use socket_patch_core::utils::fuzzy_match::fuzzy_match_packages; -use socket_patch_core::utils::purl::{is_purl, strip_purl_qualifiers}; +use socket_patch_core::utils::purl::{is_purl, normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{track_patch_fetch_failed, track_patch_fetched}; use std::collections::HashMap; use std::fmt; @@ -580,6 +580,9 @@ pub struct DownloadParams { /// `true` (`--all-releases`), every variant is downloaded. No effect /// on ecosystems without per-release artifact_id variants. pub all_releases: bool, + /// `--strict` forwarded to the nested apply (a beforeHash mismatch + /// fails instead of warn-and-overwrite). 
+ pub strict: bool, } /// Narrow a selection of patches down to the release variant(s) present @@ -1030,7 +1033,7 @@ pub async fn download_and_apply_patches( let action = decide_patch_action(&manifest, &patch.purl, &patch.uuid); if let PatchAction::Skipped = action { if !params.json && !params.silent { - eprintln!(" [skip] {} (already in manifest)", patch.purl); + eprintln!(" [skip] {} (already in manifest)", normalize_purl(&patch.purl)); } downloaded_patches.push(serde_json::json!({ "purl": patch.purl, @@ -1193,6 +1196,7 @@ pub async fn download_and_apply_patches( global_prefix: params.global_prefix.clone(), silent: params.json || params.silent, download_mode: params.download_mode.clone(), + strict: params.strict, ..crate::args::GlobalArgs::default() }, force: false, @@ -1621,6 +1625,7 @@ pub async fn run(args: GetArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, }; let (code, result_json) = download_and_apply_patches(&selected, ¶ms).await; @@ -1810,6 +1815,7 @@ async fn save_and_apply_patch(args: &GetArgs, patch: &PatchResponse) -> i32 { global_prefix: args.common.global_prefix.clone(), silent: quiet, download_mode: args.common.download_mode.clone(), + strict: args.common.strict, ..crate::args::GlobalArgs::default() }, force: false, diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index f395763..3f2b989 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -10,7 +10,7 @@ use socket_patch_core::patch::apply_lock; use socket_patch_core::utils::cleanup_blobs::{ cleanup_unused_archives, cleanup_unused_blobs, CleanupResult, }; -use socket_patch_core::utils::purl::strip_purl_qualifiers; +use socket_patch_core::utils::purl::{normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{ track_patch_scan_failed, 
track_patch_scanned, track_patch_vendor_failed, track_patch_vendored, }; @@ -54,6 +54,12 @@ pub(crate) struct GcSummary { pub blobs: CleanupResult, pub diffs: CleanupResult, pub packages: CleanupResult, + /// Vendored entries reverted (or revertable, preview mode) because + /// their patch is gone from the manifest or their dependency left the + /// lockfile graph — see `vendor::run_vendor_gc`. Sorted. + pub vendored_reverted: Vec, + /// Orphan `.socket/vendor//` dirs swept (or sweepable). + pub vendor_orphan_dirs: usize, /// `true` when `--no-prune` was set; the sub-object only carries the /// `skipped: true` field in that case. pub skipped: bool, @@ -64,6 +70,17 @@ impl GcSummary { self.blobs.bytes_freed + self.diffs.bytes_freed + self.packages.bytes_freed } + /// Fold a vendored-state GC pass into this summary. + fn absorb_vendor_gc(&mut self, v: super::vendor::VendorGcSummary) { + self.vendored_reverted = v + .dropped_reverted + .into_iter() + .chain(v.unused_reverted) + .collect(); + self.vendored_reverted.sort(); + self.vendor_orphan_dirs = v.orphan_dirs; + } + /// Serialize for a *mutating* GC pass (post-apply). 
fn to_apply_json(&self) -> serde_json::Value { if self.skipped { @@ -74,6 +91,8 @@ impl GcSummary { "removedBlobs": self.blobs.blobs_removed, "removedDiffArchives": self.diffs.blobs_removed, "removedPackageArchives": self.packages.blobs_removed, + "revertedVendoredEntries": self.vendored_reverted, + "removedVendorOrphanDirs": self.vendor_orphan_dirs, "bytesFreed": self.total_bytes(), }) } @@ -88,6 +107,8 @@ impl GcSummary { "orphanBlobs": self.blobs.blobs_removed, "orphanDiffArchives": self.diffs.blobs_removed, "orphanPackageArchives": self.packages.blobs_removed, + "revertableVendoredEntries": self.vendored_reverted, + "vendorOrphanDirs": self.vendor_orphan_dirs, "bytesReclaimable": self.total_bytes(), }) } @@ -118,6 +139,7 @@ async fn run_gc( diffs, packages, skipped: false, + ..Default::default() } } @@ -127,16 +149,28 @@ async fn run_gc( /// `prune` flag — when GC isn't requested, simply don't call this function and /// don't emit a `gc` sub-object. async fn run_apply_gc( + common: &crate::args::GlobalArgs, manifest_path: &Path, socket_dir: &Path, scanned_purls: &HashSet, vendored: &HashSet, ) -> GcSummary { + // Vendored-state GC FIRST: it reverts manifest-dropped and + // lockfile-unused vendored entries, dropping the latter's manifest + // entries — so the manifest prune + blob sweep below reclaims their + // blobs in this same pass (and the stale `vendored` exemption set is + // harmless: the entries it would exempt are already gone). + let vendor_gc = super::vendor::run_vendor_gc(common, manifest_path, /*dry_run=*/ false).await; + // Re-read the just-written manifest (the apply step may have added // or updated entries we now want to consider for pruning). 
let mut manifest = match read_manifest(manifest_path).await { Ok(Some(m)) => m, - _ => return GcSummary::default(), + _ => { + let mut gc = GcSummary::default(); + gc.absorb_vendor_gc(vendor_gc); + return gc; + } }; let prunable = detect_prunable(&manifest, scanned_purls, vendored); for purl in &prunable { @@ -147,22 +181,42 @@ async fn run_apply_gc( // file-level cleanup below still operates on the in-memory copy. let _ = write_manifest(manifest_path, &manifest).await; } - run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ false).await + let mut gc = run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ false).await; + gc.absorb_vendor_gc(vendor_gc); + gc } /// Dry-run preview of the apply-mode GC pass. Same shape as /// [`run_apply_gc`] but emits `prunable*`/`orphan*` field names and /// performs no mutation. async fn preview_apply_gc( + common: &crate::args::GlobalArgs, manifest_path: &Path, socket_dir: &Path, scanned_purls: &HashSet, vendored: &HashSet, ) -> GcSummary { + // Read-only preview of the vendored-state GC (lists, never reverts). + let vendor_gc = super::vendor::run_vendor_gc(common, manifest_path, /*dry_run=*/ true).await; + let mut manifest = match read_manifest(manifest_path).await { Ok(Some(m)) => m, - _ => return GcSummary::default(), + _ => { + let mut gc = GcSummary::default(); + gc.absorb_vendor_gc(vendor_gc); + return gc; + } }; + // Mirror the wet pass: an unused vendored entry's manifest keys are + // dropped before the blob sweep, so drop them from the in-memory copy + // too — otherwise the preview under-reports orphan blobs/bytes + // relative to what the real `--prune` run frees. 
+ for purl in &vendor_gc.unused_reverted { + let base = strip_purl_qualifiers(purl).to_string(); + manifest + .patches + .retain(|k, _| k != purl && strip_purl_qualifiers(k) != base); + } let prunable = detect_prunable(&manifest, scanned_purls, vendored); // Mirror `run_apply_gc`: drop the prunable entries from the manifest // *before* computing orphans (no write — this is the preview). The @@ -174,7 +228,9 @@ async fn preview_apply_gc( for purl in &prunable { manifest.patches.remove(purl); } - run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ true).await + let mut gc = run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ true).await; + gc.absorb_vendor_gc(vendor_gc); + gc } /// PURL strings present in the manifest but absent from `scanned_purls`. @@ -197,28 +253,197 @@ async fn preview_apply_gc( /// copy is its NORMAL state, not "no longer installed". Without this, a /// wiped node_modules would prune the manifest entry — and the next /// `vendor` run would then reconcile-revert the vendoring itself. +/// +/// Both sides are compared in percent-DECODED form (`normalize_purl`): +/// manifest keys come from the API encoded (`pkg:npm/%40scope/x@1`) while +/// crawler purls carry the literal `@scope` — comparing the raw strings +/// would make every encoded scoped entry look prunable and `--prune`/ +/// `--sync` would GC the very patch it just downloaded. 
pub(crate) fn detect_prunable( manifest: &PatchManifest, scanned_purls: &HashSet, vendored: &HashSet, ) -> Vec { - let scanned_bases: HashSet<&str> = scanned_purls + let scanned_bases: HashSet = scanned_purls .iter() - .map(|p| strip_purl_qualifiers(p)) + .map(|p| normalize_purl(strip_purl_qualifiers(p)).into_owned()) .collect(); manifest .patches .keys() .filter(|p| { - let base = strip_purl_qualifiers(p); - !scanned_bases.contains(base) + let base = normalize_purl(strip_purl_qualifiers(p)); + !scanned_bases.contains(base.as_ref()) && !vendored.contains(p.as_str()) - && !vendored.contains(base) + && !vendored.contains(strip_purl_qualifiers(p)) }) .cloned() .collect() } +/// Lockfile-only packages: dependencies the project's lockfile resolves +/// that have no crawled (installed) counterpart. +#[derive(Default)] +struct LockfileSupplement { + packages: Vec, + /// Literal crawler-form purls, for fast membership tests. + purls: HashSet, + /// The lockfile the entries came from, for messages. + source: &'static str, +} + +/// Inventory the project's lockfile(s) and fabricate crawl entries for +/// dependencies that are not installed. The fabricated `path` is the +/// WOULD-BE install dir — every consumer degrades safely on a nonexistent +/// path (hash verify → NotFound, apply → partitioned skip, vendor → +/// auto-fetch). Global scans target the machine's global tree, not this +/// project's lockfile, so they get no supplement. 
+async fn lockfile_supplement( + common: &GlobalArgs, + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> LockfileSupplement { + use socket_patch_core::patch::vendor::lock_inventory; + + let mut out = LockfileSupplement { + source: "project lockfiles", + ..Default::default() + }; + if common.global || common.global_prefix.is_some() { + return out; + } + let entries = lock_inventory::inventory_project(&common.cwd).await; + if entries.is_empty() { + return out; + } + let crawled_purls: HashSet<&str> = crawled.iter().map(|p| p.purl.as_str()).collect(); + for entry in entries { + if crawled_purls.contains(entry.purl.as_str()) { + continue; + } + let Some(pkg) = crawled_from_purl(&entry.purl, &common.cwd) else { + continue; + }; + out.purls.insert(entry.purl.clone()); + out.packages.push(pkg); + } + out +} + +/// A displayable crawl entry fabricated from a purl (decoded form). The +/// path is a placeholder consumers degrade safely on. +fn crawled_from_purl( + purl: &str, + cwd: &std::path::Path, +) -> Option { + let decoded = normalize_purl(strip_purl_qualifiers(purl)).into_owned(); + let rest = decoded.strip_prefix("pkg:")?; + let (_eco, rest) = rest.split_once('/')?; + let at = rest.rfind('@').filter(|&i| i > 0)?; + let (name_part, version) = (&rest[..at], &rest[at + 1..]); + let (namespace, name) = match name_part.rsplit_once('/') { + Some((ns, n)) => (Some(ns.to_string()), n.to_string()), + None => (None, name_part.to_string()), + }; + Some(socket_patch_core::crawlers::types::CrawledPackage { + name, + version: version.to_string(), + namespace, + purl: decoded.clone(), + path: cwd.join("node_modules").join(name_part), + }) +} + +/// Vendored-ledger packages with no crawled counterpart: on a fresh clone +/// the committed artifact IS the dependency, so these stay discoverable +/// (updates[] detection, the table, and `scan --vendor` re-vendor/in-sync +/// runs all keep working before any install). 
They are NOT "lockfile-only" +/// — nothing needs installing; the artifact satisfies the lock. +async fn vendored_ledger_supplement( + common: &GlobalArgs, + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> Vec { + if common.global || common.global_prefix.is_some() { + return Vec::new(); + } + let Ok(state) = socket_patch_core::patch::vendor::load_state(&common.cwd).await else { + return Vec::new(); + }; + let crawled_norm: HashSet = crawled + .iter() + .map(|p| normalize_purl(&p.purl).into_owned()) + .collect(); + let mut seen: HashSet = HashSet::new(); + let mut out = Vec::new(); + for entry in state.entries.values() { + let base = strip_purl_qualifiers(&entry.base_purl); + let norm = normalize_purl(base).into_owned(); + if crawled_norm.contains(&norm) || !seen.insert(norm) { + continue; + } + if let Some(pkg) = crawled_from_purl(base, &common.cwd) { + out.push(pkg); + } + } + out.sort_by(|a, b| a.purl.cmp(&b.purl)); + out +} + +/// Vendor-mode pre-prompt check: uuids of selected patches whose installed +/// files match NEITHER beforeHash nor afterHash — the patch was built +/// against different bytes than the installed artifact. Vendoring still +/// succeeds for these (the vendor stage force-applies the verified patched +/// content; see `force_apply_staged`), but the user should learn it BEFORE +/// the confirm prompt, not from a post-hoc warning event. +/// +/// Best-effort and read-only: a detail-fetch failure or an unresolvable +/// installed path just skips the annotation — it never blocks the flow and +/// writes nothing (unlike `download_patch_records`, which stages blobs). 
+async fn preverify_vendor_baselines( + api_client: &socket_patch_core::api::client::ApiClient, + org_slug: Option<&str>, + selected: &[PatchSearchResult], + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], + lockfile_only: &HashSet, +) -> HashSet { + use socket_patch_core::manifest::schema::PatchFileInfo; + use socket_patch_core::patch::apply::{verify_file_patch, VerifyStatus}; + use socket_patch_core::utils::purl::purl_eq; + + let mut mismatched: HashSet = HashSet::new(); + for patch in selected { + // API purls come percent-encoded, crawler purls literal — purl_eq + // bridges the two spellings. + let base = strip_purl_qualifiers(&patch.purl); + // Lockfile-only packages have no installed bytes to compare — the + // vendor engine fetches them pristine (nothing to annotate). + if lockfile_only.contains(normalize_purl(base).as_ref()) { + continue; + } + let Some(pkg) = crawled.iter().find(|c| purl_eq(&c.purl, base)) else { + continue; + }; + let Ok(Some(detail)) = api_client.fetch_patch(org_slug, &patch.uuid).await else { + continue; + }; + for (file, info) in &detail.files { + let info = PatchFileInfo { + before_hash: info.before_hash.clone().unwrap_or_default(), + after_hash: info.after_hash.clone().unwrap_or_default(), + }; + if info.before_hash.is_empty() { + continue; // a new file has no baseline to compare + } + if verify_file_patch(&pkg.path, file, &info).await.status + == VerifyStatus::HashMismatch + { + mismatched.insert(patch.uuid.clone()); + break; + } + } + } + mismatched +} + /// Cross-reference an existing manifest against discovery results to find /// PURLs whose newest available patch UUID differs from the locally-recorded /// one. Used by both the discovery JSON path and the table-print path. 
@@ -613,7 +838,7 @@ async fn run_vendor_json_path( result["vendor"] = preview_vendor_json(&args.common.cwd, &selected).await; if prune { let gc = - preview_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + preview_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; result["gc"] = gc.to_preview_json(); } let final_code = @@ -639,6 +864,7 @@ async fn run_vendor_json_path( download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, }; let mut has_errors = false; let detached_records: Option> = if args.detached { @@ -674,7 +900,7 @@ async fn run_vendor_json_path( // package_not_installed; vendored entries are exempt from // the prune itself. if prune { - let gc = run_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = run_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; result["gc"] = gc.to_apply_json(); } @@ -760,7 +986,7 @@ async fn run_vendor_interactive_path( // GC before the vendor step (see the JSON path): stale manifest // entries would fail vendoring with package_not_installed. 
if prune { - let gc = run_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = run_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; if !gc.pruned.is_empty() { println!("GC: pruned {} manifest entr{}.", gc.pruned.len(), { if gc.pruned.len() == 1 { @@ -770,6 +996,15 @@ async fn run_vendor_interactive_path( } }); } + if !gc.vendored_reverted.is_empty() || gc.vendor_orphan_dirs > 0 { + println!( + "GC: reverted {} vendored entr{}; swept {} orphan vendor dir{}.", + gc.vendored_reverted.len(), + if gc.vendored_reverted.len() == 1 { "y" } else { "ies" }, + gc.vendor_orphan_dirs, + if gc.vendor_orphan_dirs == 1 { "" } else { "s" }, + ); + } } match boxed_scan_vendor_step( &args.common, @@ -830,6 +1065,39 @@ fn partition_vendored_selected( (kept, vendored_records) } +/// Lockfile-only patches are skipped BEFORE download in apply mode: the +/// package is not on disk to patch in place, and downloading its patch +/// into the manifest would create a not-yet-appliable entry (and flip the +/// apply path's exit code). `scan --vendor` is the route that handles them +/// (the vendor engine auto-fetches lockfile-resolved packages). Matching +/// bridges API purl encoding via `normalize_purl`. Same shape/mechanics as +/// [`partition_vendored_selected`]. 
+fn partition_not_installed_selected( + selected: Vec, + lockfile_only: &HashSet, +) -> (Vec, Vec) { + if lockfile_only.is_empty() { + return (selected, Vec::new()); + } + let is_lockfile_only = |p: &str| { + lockfile_only.contains(normalize_purl(strip_purl_qualifiers(p)).as_ref()) + }; + let (not_installed, kept): (Vec<_>, Vec<_>) = selected + .into_iter() + .partition(|p| is_lockfile_only(&p.purl)); + let mut records: Vec = not_installed + .iter() + .map(|p| { + serde_json::json!({ + "purl": p.purl, "uuid": p.uuid, + "action": "skipped", "errorCode": "package_not_installed", + }) + }) + .collect(); + records.sort_by(|a, b| a["purl"].as_str().cmp(&b["purl"].as_str())); + (kept, records) +} + /// Fold the pre-download vendored skips into the apply report returned by /// `download_and_apply_patches`: they were "found" by discovery and /// skipped here, never downloaded. Also strips the inner `status` (scan @@ -1063,7 +1331,28 @@ pub async fn run(args: ScanArgs) -> i32 { } // Crawl packages - let (all_crawled, eco_counts) = crawl_all_ecosystems(&crawler_options).await; + let (mut all_crawled, mut eco_counts) = crawl_all_ecosystems(&crawler_options).await; + + // Lockfile supplement: dependencies the project's lockfile resolves + // that have NO installed copy (fresh clone, partial install). They join + // discovery — counts, API lookup, table, the prune "scanned" set — and + // are flagged "not yet installed" everywhere a user could act on them. 
+ let lockfile_only = lockfile_supplement(&args.common, &all_crawled).await; + if !lockfile_only.packages.is_empty() { + for pkg in &lockfile_only.packages { + if let Some(eco) = Ecosystem::from_purl(&pkg.purl) { + *eco_counts.entry(eco).or_insert(0) += 1; + } + } + all_crawled.extend(lockfile_only.packages.iter().cloned()); + } + let ledger_supplement = vendored_ledger_supplement(&args.common, &all_crawled).await; + for pkg in &ledger_supplement { + if let Some(eco) = Ecosystem::from_purl(&pkg.purl) { + *eco_counts.entry(eco).or_insert(0) += 1; + } + } + all_crawled.extend(ledger_supplement); // Every PURL the crawl found, captured BEFORE the `--ecosystems` // display/query filter is applied. Prune (below) must reference the @@ -1072,6 +1361,9 @@ pub async fn run(args: ScanArgs) -> i32 { // prune used the filtered set instead, `scan --ecosystems npm --prune` // would treat every cargo/go/pypi/gem manifest entry as "uninstalled" // and delete it (plus its blobs) — silent cross-ecosystem data loss. + // Lockfile-only purls are deliberately included: a dependency the + // lockfile still resolves must not be pruned just because node_modules + // is wiped or partially installed. 
let installed_purls: HashSet<String> = all_crawled.iter().map(|p| p.purl.clone()).collect(); // Vendor-ledger purl keys, loaded once and shared by the prune @@ -1129,6 +1421,7 @@ pub async fn run(args: ScanArgs) -> i32 { let mut result = serde_json::json!({ "status": "success", "scannedPackages": 0, + "lockfileOnlyPackages": 0, "packagesWithPatches": 0, "totalPatches": 0, "freePatches": 0, @@ -1189,6 +1482,13 @@ pub async fn run(args: ScanArgs) -> i32 { } else { eprintln!("Found {package_count} packages{eco_summary}"); } + if !lockfile_only.purls.is_empty() { + eprintln!( + "Note: {} package(s) from {} are not yet installed (lockfile-only).", + lockfile_only.purls.len(), + lockfile_only.source, + ); + } } // Query API in batches @@ -1374,6 +1674,7 @@ pub async fn run(args: ScanArgs) -> i32 { let mut result = serde_json::json!({ "status": "success", "scannedPackages": package_count, + "lockfileOnlyPackages": lockfile_only.purls.len(), "packagesWithPatches": all_packages_with_patches.len(), "totalPatches": total_patches, "freePatches": free_patches, @@ -1386,6 +1687,19 @@ pub async fn run(args: ScanArgs) -> i32 { "newUuid": u.new_uuid, })).collect::<Vec<_>>(), }); + // Flag lockfile-only packages so JSON consumers can tell "patch + // available but not installed" from the installed case. Additive + // field; absent means installed. + if let Some(packages) = result["packages"].as_array_mut() { + for pkg in packages { + let is_lockfile_only = pkg["purl"] + .as_str() + .is_some_and(|p| lockfile_only.purls.contains(p)); + if is_lockfile_only { + pkg["notInstalled"] = serde_json::json!(true); + } + } + } // `apply` and `prune` are computed once at the top of run() // (factoring in --sync, which implies both). They're independent @@ -1428,6 +1742,17 @@ pub async fn run(args: ScanArgs) -> i32 { // operator's signal to run `scan --vendor` (or `vendor`). 
let (selected, vendored_records) = partition_vendored_selected(selected, &vendored_purls); + // Lockfile-only purls leave the apply selection here (calm + // skip records, never an error); the union rides the same + // bookkeeping as the vendored skips. + let (selected, vendored_records) = { + let (kept, not_installed) = + partition_not_installed_selected(selected, &lockfile_only.purls); + let mut all = vendored_records; + all.extend(not_installed); + all.sort_by(|a, b| a["purl"].as_str().cmp(&b["purl"].as_str())); + (kept, all) + }; let mut apply_code = 0i32; if dry { @@ -1497,6 +1822,7 @@ pub async fn run(args: ScanArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, }; let (code, apply_json) = download_and_apply_patches(&selected, &params).await; apply_code = code; @@ -1511,10 +1837,10 @@ pub async fn run(args: ScanArgs) -> i32 { // --- GC (if requested) -------------------------------------- if prune { let gc = if dry { - preview_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls) + preview_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls) .await } else { - run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await }; result["gc"] = if dry { gc.to_preview_json() @@ -1564,9 +1890,9 @@ pub async fn run(args: ScanArgs) -> i32 { // --- GC-only path (no --apply, just --prune) -------------------- if prune { let gc = if dry { - preview_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + preview_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await } else { - run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, 
&vendored_purls).await }; result["gc"] = if dry { gc.to_preview_json() @@ -1670,14 +1996,22 @@ pub async fn run(args: ScanArgs) -> i32 { } else { String::new() }; + // Lockfile-only packages can be patched by `scan --vendor` + // (which fetches them pristine) but not applied in place. + let not_installed_marker = if lockfile_only.purls.contains(pkg.purl.as_str()) { + color(" [NOT INSTALLED]", "33", use_color) + } else { + String::new() + }; println!( - "{:<40} {:>8} {:<16} {}{}", + "{:<40} {:>8} {:<16} {}{}{}", display_purl, count_str, format_severity(severity, use_color), vuln_str, update_marker, + not_installed_marker, ); } @@ -1804,7 +2138,30 @@ pub async fn run(args: ScanArgs) -> i32 { for p in &vendored_selected { println!( " [skip] {} (vendored — run scan --vendor to update)", - p.purl + normalize_purl(&p.purl) + ); + } + } + + // Lockfile-only purls leave the in-place apply selection (calm skip, + // mirrors the JSON path). In `--vendor` mode they stay: the vendor + // engine fetches lockfile-resolved packages pristine. + let (selected, not_installed_selected): (Vec<_>, Vec<String>) = if args.vendor { + (selected, Vec::new()) + } else { + let (kept, skipped) = partition_not_installed_selected(selected, &lockfile_only.purls); + let printed: Vec<String> = skipped + .iter() + .filter_map(|r| r["purl"].as_str().map(str::to_string)) + .collect(); + (kept, printed) + }; + if !args.common.silent { + for purl in &not_installed_selected { + println!( + " [skip] {} (not installed — run your package manager's install first, \ + or `scan --vendor` to vendor it from the lockfile)", + normalize_purl(purl) ); } } @@ -1816,6 +2173,22 @@ pub async fn run(args: ScanArgs) -> i32 { return embed_vex_human(&args.common, &args.vex, &manifest_path, 0).await; } + // Vendor mode: pre-verify baselines so a content mismatch surfaces + // BEFORE the confirm prompt (vendoring still proceeds for these — + // the stage force-applies the verified patched content). 
+ let mismatched_baselines: HashSet = if args.vendor && !args.common.silent { + preverify_vendor_baselines( + &api_client, + effective_org_slug, + &selected, + &filtered_crawled, + &lockfile_only.purls, + ) + .await + } else { + HashSet::new() + }; + // Display detailed summary of selected patches before confirming // (presentational only — skipped wholesale under --silent). if !args.common.silent { @@ -1851,10 +2224,18 @@ pub async fn run(args: ScanArgs) -> i32 { println!( " {} [{}] {}", - patch.purl, + // Human display only: show the decoded form of an + // API-encoded purl (`%40scope` → `@scope`). JSON output + // keeps the verbatim key. + normalize_purl(&patch.purl), patch.tier.to_uppercase(), sev_colored, ); + if mismatched_baselines.contains(&patch.uuid) { + println!( + " (installed content differs from patch baseline — will vendor patched content)" + ); + } if !vuln_ids.is_empty() { println!(" Fixes: {}", vuln_ids.join(", ")); } @@ -1925,6 +2306,7 @@ pub async fn run(args: ScanArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, }; let code = if args.vendor { @@ -1954,7 +2336,7 @@ pub async fn run(args: ScanArgs) -> i32 { // run `socket-patch gc` (or `repair`) explicitly. (Vendor mode // already ran its GC before the vendor step.) 
if prune && !args.vendor { - let gc = run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await; + let gc = run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await; let total = gc.blobs.blobs_removed + gc.diffs.blobs_removed + gc.packages.blobs_removed; if !args.common.silent && (!gc.pruned.is_empty() || total > 0) { println!( @@ -1966,6 +2348,15 @@ pub async fn run(args: ScanArgs) -> i32 { socket_patch_core::utils::cleanup_blobs::format_bytes(gc.total_bytes()), ); } + if !args.common.silent && (!gc.vendored_reverted.is_empty() || gc.vendor_orphan_dirs > 0) { + println!( + "GC: reverted {} vendored entr{}; swept {} orphan vendor dir{}.", + gc.vendored_reverted.len(), + if gc.vendored_reverted.len() == 1 { "y" } else { "ies" }, + gc.vendor_orphan_dirs, + if gc.vendor_orphan_dirs == 1 { "" } else { "s" }, + ); + } } embed_vex_human(&args.common, &args.vex, &manifest_path, code).await @@ -2156,6 +2547,16 @@ mod tests { HashSet::new() } + /// GlobalArgs rooted at the test project dir (the vendored-state GC + /// loads `.socket/vendor/state.json` from `cwd`; these fixtures have + /// none, so the vendor pass is a no-op). + fn gc_common(cwd: &Path) -> crate::args::GlobalArgs { + crate::args::GlobalArgs { + cwd: cwd.to_path_buf(), + ..Default::default() + } + } + #[test] fn detect_prunable_empty_manifest_empty_scanned() { let m = PatchManifest::new(); @@ -2244,6 +2645,23 @@ mod tests { ); } + #[test] + fn detect_prunable_encoded_manifest_key_not_pruned() { + // The API serves scoped purls percent-encoded and they land in the + // manifest verbatim; the crawler reports the literal `@scope` form. + // Comparing raw strings would make every encoded scoped entry look + // prunable — `scan --prune` would GC the patch it just downloaded. 
+ let m = manifest_with(&[("pkg:npm/%40scope/x@1.0.0", "uuid-a")]); + let s = scanned(&["pkg:npm/@scope/x@1.0.0"]); + assert!( + detect_prunable(&m, &s, &no_vendored()).is_empty(), + "encoded manifest key must match the decoded scanned purl" + ); + // A genuinely-gone encoded entry still prunes. + let out = detect_prunable(&m, &scanned(&[]), &no_vendored()); + assert_eq!(out, vec!["pkg:npm/%40scope/x@1.0.0".to_string()]); + } + #[test] fn detect_prunable_exempts_qualified_variant_of_vendored_base() { // The ledger key set carries qualifier-stripped bases (see @@ -2319,7 +2737,14 @@ mod tests { seed_manifest_with_blob(tmp.path(), "pkg:npm/gone@1.0.0", &after_hash); let scanned: HashSet = HashSet::new(); - let preview = preview_apply_gc(&manifest_path, &socket_dir, &scanned, &no_vendored()).await; + let preview = preview_apply_gc( + &gc_common(tmp.path()), + &manifest_path, + &socket_dir, + &scanned, + &no_vendored(), + ) + .await; assert_eq!( preview.pruned, @@ -2356,13 +2781,20 @@ mod tests { let (mp_p, sd_p, blob_p) = seed_manifest_with_blob(tmp_preview.path(), "pkg:npm/gone@1.0.0", &after_hash); let scanned: HashSet = HashSet::new(); - let preview = preview_apply_gc(&mp_p, &sd_p, &scanned, &no_vendored()).await; + let preview = preview_apply_gc( + &gc_common(tmp_preview.path()), + &mp_p, + &sd_p, + &scanned, + &no_vendored(), + ) + .await; assert!(blob_p.exists(), "preview must not mutate"); let tmp_wet = tempfile::tempdir().unwrap(); let (mp_w, sd_w, blob_w) = seed_manifest_with_blob(tmp_wet.path(), "pkg:npm/gone@1.0.0", &after_hash); - let wet = run_apply_gc(&mp_w, &sd_w, &scanned, &no_vendored()).await; + let wet = run_apply_gc(&gc_common(tmp_wet.path()), &mp_w, &sd_w, &scanned, &no_vendored()).await; assert_eq!( preview.blobs.blobs_removed, wet.blobs.blobs_removed, diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index ac7f9c6..0ca3037 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ 
b/crates/socket-patch-cli/src/commands/vendor.rs @@ -27,7 +27,7 @@ use socket_patch_core::patch::vendor::{ self, ecosystem_dir_for_purl, load_state, save_state, RevertOutcome, VendorEntry, VendorOutcome, VendorWarning, }; -use socket_patch_core::utils::purl::strip_purl_qualifiers; +use socket_patch_core::utils::purl::{normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{track_patch_vendor_failed, track_patch_vendored}; use socket_patch_core::vex::time::now_rfc3339; use std::collections::{HashMap, HashSet}; @@ -49,8 +49,12 @@ pub struct VendorArgs { #[command(flatten)] pub common: GlobalArgs, - /// Skip pre-vendor hash verification (vendor even if the installed - /// package's files differ from the patch's beforeHash). + /// Tolerate MISSING patch-target files in the staged copy (they are + /// skipped instead of failing the vendor) and bypass the variant + /// probe for multi-release ecosystems. A plain beforeHash mismatch + /// no longer needs this: vendor staging always overwrites mismatched + /// content with the verified patched bytes (surfaced as a + /// `vendor_content_mismatch_overwritten` warning). #[arg( short = 'f', long, @@ -235,6 +239,62 @@ pub(crate) async fn dispatch_revert_one( } } +/// Is this vendored entry still consumed by its project's lockfile +/// dependency graph? `None` = cannot determine — callers must keep the +/// entry (fail-safe): non-npm ecosystems have no in-use probe yet, and a +/// missing/unreadable lockfile proves nothing. +pub(crate) async fn dispatch_in_use_one(entry: &VendorEntry, project_root: &Path) -> Option { + match entry.ecosystem.as_str() { + "npm" => { + socket_patch_core::patch::vendor::npm_flavor::vendored_entry_in_use( + entry, + project_root, + ) + .await + } + _ => None, + } +} + +/// Uuid dirs under `.socket/vendor//` with no owning `(eco, uuid)` +/// ledger entry (a hand-edited state file, or artifacts left by an +/// interrupted run). 
The lockfile wiring for these is already gone or +/// owned by a recorded entry, so removal is safe; removed unless +/// `dry_run`. Unparseable dirs are never returned (and never deleted). +/// Returns the orphans so callers can emit events / counts. +pub(crate) async fn sweep_orphan_vendor_dirs( + cwd: &Path, + state: &socket_patch_core::patch::vendor::VendorState, + dry_run: bool, +) -> Vec { + let recorded_units: HashSet<(&str, &str)> = state + .entries + .values() + .map(|e| (e.ecosystem.as_str(), e.uuid.as_str())) + .collect(); + let mut orphans = Vec::new(); + for unit in vendor::path::sweep_vendor_dirs(cwd).await { + if recorded_units.contains(&(unit.eco.as_str(), unit.uuid.as_str())) { + continue; + } + if !dry_run { + let _ = remove_tree(&unit.dir).await; + } + orphans.push(unit); + } + orphans +} + +/// Does `eco` fall inside this run's `--ecosystems` scope? +pub(crate) fn ecosystem_in_scope(common: &GlobalArgs, eco: &str) -> bool { + match common.ecosystems.as_deref() { + None => true, + Some(list) => list.iter().any(|e| { + e.eq_ignore_ascii_case(eco) || (eco == "golang" && e.eq_ignore_ascii_case("go")) + }), + } +} + /// Surface a backend warning: stderr line for humans, a Skipped event with /// the stable code for JSON consumers (Skipped never flips the status). 
fn record_warning(env: &mut Envelope, purl: &str, warning: &VendorWarning, common: &GlobalArgs) { @@ -465,13 +525,137 @@ pub(crate) async fn vendor_records( global_prefix: common.global_prefix.clone(), batch_size: 100, }; - let all_packages = find_packages_for_purls( + let mut all_packages = find_packages_for_purls( &vendorable_partition, &crawler_options, common.silent || common.json, ) .await; + // ── Auto-fetch: lockfile-resolved packages with no installed copy ──── + // A manifest patch whose package is not on disk but IS resolvable from + // the project's lockfile is fetched pristine from its registry (lock- + // recorded URL else the conventional one), verified against the lock's + // integrity FAIL-CLOSED, and staged from a private tempdir — the + // project tree is never touched, and the lock wiring works without an + // installed copy (it keys off lock entries). The holders keep the + // tempdirs alive until the dispatch loop below has staged from them. + let mut fetched_holders: Vec = + Vec::new(); + // Fetch failures must keep their distinct Failed event; this set + // suppresses the later duplicate `package_not_installed` skip. + let mut fetch_failed: HashSet = HashSet::new(); + { + use socket_patch_core::patch::vendor::{lock_inventory, registry_fetch}; + let missing: Vec = vendorable + .iter() + .filter(|p| !all_packages.contains_key(*p)) + .cloned() + .collect(); + if !missing.is_empty() { + // The inventory is a local file read — fine offline; only the + // fetch itself needs the network. + let inventory = lock_inventory::inventory_project(&common.cwd).await; + let client = registry_fetch::build_registry_client(); + // Pre-loaded vendor ledger for the artifact-staging path: an + // already-vendored purl with no installed copy (fresh clone) + // stages from its own committed artifact, sha256-verified + // against the ledger — offline-safe, no registry traffic. 
+ let ledger = load_state(&common.cwd).await.unwrap_or_default(); + for purl in &missing { + if let Some(entry) = ledger + .entries + .get(purl) + .or_else(|| ledger.entries.values().find(|e| &e.base_purl == purl)) + .filter(|e| e.ecosystem == "npm" && e.artifact.path.ends_with(".tgz")) + { + let tgz = common.cwd.join(&entry.artifact.path); + match registry_fetch::stage_local_artifact(&tgz, &entry.artifact.sha256) + .await + { + Ok(staged) => { + all_packages.insert(purl.clone(), staged.dir().to_path_buf()); + fetched_holders.push(staged); + continue; + } + Err(registry_fetch::FetchError::Failed(detail)) => { + // A corrupt committed artifact is worth a loud + // failure — re-vendoring over it would mask the + // corruption. + fetch_failed.insert(purl.clone()); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!( + "Cannot vendor {}: {detail}", + normalize_purl(purl) + ); + } + continue; + } + Err(registry_fetch::FetchError::Unverifiable(_)) => { + // No recorded hash (legacy ledger) — fall + // through to the lockfile/registry path. + } + } + } + let Some(entry) = lock_inventory::lookup(&inventory, purl) else { + continue; // not lockfile-resolvable → package_not_installed + }; + if common.offline { + // The enriched skip detail lands below in the unmatched + // pass (the purl stays unmatched). 
+ continue; + } + match registry_fetch::fetch_and_stage(entry, &client).await { + Ok(fetched) => { + record_warning( + env, + purl, + &VendorWarning::new( + "vendor_fetched_missing", + format!( + "{}@{} is not installed; fetched the pristine artifact \ + from {} (integrity verified against the lockfile) and \ + vendored from that copy — the project tree was not \ + touched", + entry.name, entry.version, fetched.url + ), + ), + common, + ); + all_packages.insert(purl.clone(), fetched.dir().to_path_buf()); + fetched_holders.push(fetched); + } + Err(registry_fetch::FetchError::Unverifiable(detail)) => { + record_warning( + env, + purl, + &VendorWarning::new("vendor_fetch_unverifiable", detail), + common, + ); + // Falls through to package_not_installed below. + } + Err(registry_fetch::FetchError::Failed(detail)) => { + fetch_failed.insert(purl.clone()); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!( + "Cannot vendor {}: fetch failed: {detail}", + normalize_purl(purl) + ); + } + } + } + } + } + } + let vendored_at = now_rfc3339(); let mut state = match load_state(&common.cwd).await { Ok(s) => s, @@ -566,7 +750,7 @@ pub(crate) async fn vendor_records( ); } if !common.silent && !common.json { - eprintln!("Cannot vendor {candidate}: {detail}"); + eprintln!("Cannot vendor {}: {detail}", normalize_purl(candidate)); } } Some(VendorOutcome::Done { @@ -579,23 +763,45 @@ pub(crate) async fn vendor_records( if !common.silent && !common.json { eprintln!( "Failed to vendor {}: {}", - candidate, + normalize_purl(candidate), result.error.as_deref().unwrap_or("unknown error") ); } } let mut event = result_to_event(&result, common.dry_run); // The shared translator's in-sync classification reads - // `already_patched`; under `vendor` the contract tag is - // `already_vendored` (artifact + wiring already in sync). + // `already_patched`. 
Two distinct cases land there: + // + // * `entry` is None — the TRUE in-sync rerun (the backend + // synthesized AlreadyPatched and recorded nothing); + // under `vendor` the contract tag is `already_vendored`. + // * `entry` is Some — the FIRST vendor of a package + // already patched in place by `apply`: every file + // verified AlreadyPatched, but THIS run packed the + // artifact and rewired the lock. That is an Applied + // (`summary.applied` must count it), not a skip. if event.action == PatchAction::Skipped && event.error_code.as_deref() == Some("already_patched") { - event = PatchEvent::new(PatchAction::Skipped, candidate.clone()) - .with_reason( - "already_vendored", - "artifact and lockfile wiring already in sync", - ); + if entry.is_none() { + event = PatchEvent::new(PatchAction::Skipped, candidate.clone()) + .with_reason( + "already_vendored", + "artifact and lockfile wiring already in sync", + ); + } else { + let files = result + .files_verified + .iter() + .map(|f| crate::json_envelope::PatchEventFile { + path: f.file.clone(), + verified: true, + applied_via: None, + }) + .collect(); + event = PatchEvent::new(PatchAction::Applied, candidate.clone()) + .with_files(files); + } } env.record(event); for w in &warnings { @@ -681,10 +887,10 @@ pub(crate) async fn vendor_records( } // Manifest entries that targeted in-scope ecosystems but had no - // installed package on disk. + // installed package on disk (and could not be auto-fetched). 
let mut unmatched: Vec = vendorable .iter() - .filter(|p| !matched.contains(*p)) + .filter(|p| !matched.contains(*p) && !fetch_failed.contains(*p)) .cloned() .collect(); unmatched.sort(); @@ -694,15 +900,39 @@ pub(crate) async fn vendor_records( .map(|p| strip_purl_qualifiers(p).to_string()) .collect(); unmatched.retain(|p| !vendored_bases.contains(strip_purl_qualifiers(p))); + has_errors |= !fetch_failed.is_empty(); if !unmatched.is_empty() { has_errors = true; + // Offline runs name the packages the lockfile COULD have fetched — + // the inventory is a local file read, allowed offline. + let lock_resolvable: HashSet = if common.offline { + let entries = + socket_patch_core::patch::vendor::lock_inventory::inventory_project(&common.cwd) + .await; + unmatched + .iter() + .filter(|p| { + socket_patch_core::patch::vendor::lock_inventory::lookup(&entries, p) + .is_some() + }) + .cloned() + .collect() + } else { + HashSet::new() + }; for purl in &unmatched { + let detail = if lock_resolvable.contains(purl) { + "no installed package found; --offline prevents fetching it from the \ + registry (the lockfile resolves it)" + } else { + "no installed package found" + }; env.record( PatchEvent::new(PatchAction::Skipped, purl.clone()) - .with_reason("package_not_installed", "no installed package found"), + .with_reason("package_not_installed", detail), ); if !common.silent && !common.json { - eprintln!("Cannot vendor {purl}: package not installed"); + eprintln!("Cannot vendor {}: {detail}", normalize_purl(purl)); } } } @@ -741,12 +971,6 @@ pub(crate) async fn reconcile_dropped( // Respect this run's --ecosystems scope: a `vendor --ecosystems npm` // invocation must not silently revert a cargo/go entry (restoring its // lockfile and deleting its artifact) as a cross-ecosystem side effect. 
- let in_scope = |eco: &str| match common.ecosystems.as_deref() { - None => true, - Some(list) => list.iter().any(|e| { - e.eq_ignore_ascii_case(eco) || (eco == "golang" && e.eq_ignore_ascii_case("go")) - }), - }; let stale: Vec = state .entries .iter() @@ -756,7 +980,7 @@ pub(crate) async fn reconcile_dropped( // normal state, not a drop — only `vendor --revert` or // `remove` may undo them. !entry.detached - && in_scope(&entry.ecosystem) + && ecosystem_in_scope(common, &entry.ecosystem) && !manifest.patches.contains_key(*purl) && !manifest.patches.contains_key(&entry.base_purl) }) @@ -849,19 +1073,7 @@ async fn run_revert(args: &VendorArgs, env: &mut Envelope) -> i32 { // state file, or artifacts left by an interrupted run). The lockfile // wiring for these is already gone or owned by a recorded entry, so // removal is safe; unparseable dirs are reported, never deleted. - let swept = vendor::path::sweep_vendor_dirs(&common.cwd).await; - let recorded_units: HashSet<(&str, &str)> = state - .entries - .values() - .map(|e| (e.ecosystem.as_str(), e.uuid.as_str())) - .collect(); - for unit in swept { - if recorded_units.contains(&(unit.eco.as_str(), unit.uuid.as_str())) { - continue; - } - if !common.dry_run { - let _ = remove_tree(&unit.dir).await; - } + for unit in sweep_orphan_vendor_dirs(&common.cwd, &state, common.dry_run).await { let label = unit .purls .first() @@ -899,3 +1111,381 @@ async fn run_revert(args: &VendorArgs, env: &mut Envelope) -> i32 { 0 } } + +// ───────────────────────── prune-time vendored GC ───────────────────────── + +/// Summary of the vendored-state GC pass `scan --prune` runs (wet or +/// preview). Purls are the state-ledger keys (manifest spelling). +#[derive(Debug, Default)] +pub(crate) struct VendorGcSummary { + /// (a) entries whose patch is gone from the manifest — reverted. + pub dropped_reverted: Vec, + /// (b) entries whose package left the lockfile dependency graph — + /// reverted, and their manifest entries dropped. 
+ pub unused_reverted: Vec, + /// (c) orphan uuid dirs (no owning ledger entry) swept. + pub orphan_dirs: usize, + /// Entries that could not be reverted (kept in the ledger), plus any + /// pass-level skip marker (e.g. lock contention). + pub failed: Vec, +} + +/// The vendored-state GC behind `scan --prune`: +/// +/// (a) revert entries whose patch was dropped from the manifest (same +/// stale test as [`reconcile_dropped`], shared with the vendor flows); +/// (b) revert entries whose dependency is no longer in the lockfile graph +/// ([`dispatch_in_use_one`] == `Some(false)`; `None` keeps, fail-safe) +/// and drop their manifest entries so the caller's manifest prune + +/// blob sweep reclaims the rest in the same pass; +/// (c) sweep orphan uuid dirs. +/// +/// Detached entries are exempt from BOTH (a) (never manifest-tracked) and +/// (b) (lockfile-invisible by design — the probe would always call them +/// unused). A missing/unreadable manifest skips (a) only (a prune must +/// not mass-revert on a deleted manifest — that is `vendor --revert`'s +/// explicit contract). +/// +/// Wet runs take the apply lock (lockfiles + the manifest are rewritten); +/// contention records a skip marker and returns — it never fails the +/// scan. Dry runs are read-only, lock-free, and list-only. +pub(crate) async fn run_vendor_gc( + common: &GlobalArgs, + manifest_path: &Path, + dry_run: bool, +) -> VendorGcSummary { + let mut out = VendorGcSummary::default(); + let mut state = match load_state(&common.cwd).await { + Ok(s) if !s.entries.is_empty() => s, + // No ledger (or unreadable): only the orphan sweep could apply, and + // without a trustworthy ledger it must not delete anything. 
+ _ => return out, + }; + + let socket_dir = manifest_path + .parent() + .map(Path::to_path_buf) + .unwrap_or_else(|| common.cwd.clone()); + let _guard = if dry_run { + None + } else { + match socket_patch_core::patch::apply_lock::acquire(&socket_dir, Duration::from_secs(0)) { + Ok(g) => Some(g), + Err(_) => { + out.failed.push( + "vendor GC skipped: another socket-patch run holds the apply lock".to_string(), + ); + return out; + } + } + }; + + // (a) manifest-dropped entries. + let mut manifest = socket_patch_core::manifest::operations::read_manifest(manifest_path) + .await + .ok() + .flatten(); + if let Some(m) = &manifest { + let stale: Vec = state + .entries + .iter() + .filter(|(purl, entry)| { + !entry.detached + && ecosystem_in_scope(common, &entry.ecosystem) + && !m.patches.contains_key(*purl) + && !m.patches.contains_key(&entry.base_purl) + }) + .map(|(purl, _)| purl.clone()) + .collect(); + for purl in stale { + if dry_run { + out.dropped_reverted.push(purl); + continue; + } + let entry = state.entries.get(&purl).cloned().expect("listed above"); + if dispatch_revert_one(&entry, &common.cwd, false).await.success { + state.entries.remove(&purl); + out.dropped_reverted.push(purl); + } else { + out.failed.push(purl); + } + } + } + + // (b) lockfile-unused entries. 
+ let mut manifest_dirty = false; + let candidates: Vec = state + .entries + .iter() + .filter(|(_, entry)| !entry.detached && ecosystem_in_scope(common, &entry.ecosystem)) + .map(|(purl, _)| purl.clone()) + .collect(); + for purl in candidates { + let entry = state.entries.get(&purl).cloned().expect("listed above"); + if dispatch_in_use_one(&entry, &common.cwd).await != Some(false) { + continue; // in use, or cannot determine — keep + } + if dry_run { + out.unused_reverted.push(purl); + continue; + } + if !dispatch_revert_one(&entry, &common.cwd, false).await.success { + out.failed.push(purl); + continue; + } + state.entries.remove(&purl); + if let Some(m) = manifest.as_mut() { + let base = strip_purl_qualifiers(&entry.base_purl).to_string(); + let dropped: Vec = m + .patches + .keys() + .filter(|k| *k == &purl || strip_purl_qualifiers(k) == base) + .cloned() + .collect(); + for k in dropped { + m.patches.remove(&k); + manifest_dirty = true; + } + } + out.unused_reverted.push(purl); + } + + if !dry_run { + let _ = save_state(&common.cwd, &state).await; + if manifest_dirty { + if let Some(m) = &manifest { + let _ = + socket_patch_core::manifest::operations::write_manifest(manifest_path, m).await; + } + } + } + + // (c) orphan uuid dirs, against the post-removal ledger. 
+ out.orphan_dirs = sweep_orphan_vendor_dirs(&common.cwd, &state, dry_run) + .await + .len(); + out +} + +#[cfg(test)] +mod gc_tests { + use super::*; + use socket_patch_core::manifest::operations::{read_manifest, write_manifest}; + use socket_patch_core::patch::vendor::state::VendorArtifact; + use socket_patch_core::patch::vendor::VendorState; + use std::path::PathBuf; + + const UUID: &str = "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"; + const PURL: &str = "pkg:npm/left-pad@1.3.0"; + + fn entry(detached: bool) -> VendorEntry { + VendorEntry { + ecosystem: "npm".into(), + base_purl: PURL.into(), + uuid: UUID.into(), + artifact: VendorArtifact { + path: format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring: Vec::new(), + lock: None, + took_over_go_patches: false, + detached, + record: None, + flavor: Some("package-lock".into()), + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } + } + + /// Tempdir with: a manifest carrying PURL, a ledger with one entry, + /// the artifact on disk, and a package-lock that resolves to it. 
+ async fn gc_fixture(detached: bool) -> (tempfile::TempDir, GlobalArgs, PathBuf) { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let socket = root.join(".socket"); + tokio::fs::create_dir_all(socket.join(format!("vendor/npm/{UUID}"))) + .await + .unwrap(); + tokio::fs::write( + socket.join(format!("vendor/npm/{UUID}/left-pad-1.3.0.tgz")), + b"tgz", + ) + .await + .unwrap(); + + let mut manifest = PatchManifest::new(); + manifest.patches.insert( + PURL.to_string(), + socket_patch_core::manifest::schema::PatchRecord { + uuid: UUID.to_string(), + exported_at: String::new(), + files: HashMap::new(), + vulnerabilities: HashMap::new(), + description: String::new(), + license: String::new(), + tier: String::new(), + }, + ); + let manifest_path = socket.join("manifest.json"); + write_manifest(&manifest_path, &manifest).await.unwrap(); + + let mut state = VendorState::default(); + state.entries.insert(PURL.to_string(), entry(detached)); + save_state(root, &state).await.unwrap(); + + tokio::fs::write( + root.join("package-lock.json"), + format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await + .unwrap(); + + let common = GlobalArgs { + cwd: root.to_path_buf(), + json: true, + silent: true, + ..GlobalArgs::default() + }; + (tmp, common, manifest_path) + } + + /// In-manifest + in-lock: the GC keeps everything. + #[tokio::test] + async fn vendor_gc_keeps_in_use_entries() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.dropped_reverted.is_empty(), "{out:?}"); + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert_eq!(out.orphan_dirs, 0); + assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + } + + /// (a) the patch is gone from the manifest: revert + drop the entry. 
+ #[tokio::test] + async fn vendor_gc_reverts_manifest_dropped_entry() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + write_manifest(&manifest_path, &PatchManifest::new()) + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.dropped_reverted, vec![PURL.to_string()], "{out:?}"); + assert!(out.failed.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.is_empty()); + assert!( + !tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + "artifact dir removed by the revert" + ); + } + + /// (b) the dependency left the lockfile graph: revert + drop BOTH the + /// ledger entry and the manifest entry. + #[tokio::test] + async fn vendor_gc_reverts_unused_entry_and_drops_manifest_entry() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + // Re-lock without the dependency (no reference to the artifact). + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.unused_reverted, vec![PURL.to_string()], "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.is_empty()); + let manifest = read_manifest(&manifest_path).await.unwrap().unwrap(); + assert!( + !manifest.patches.contains_key(PURL), + "the unused entry's manifest record is dropped too" + ); + } + + /// Dry run lists without mutating anything. 
+ #[tokio::test] + async fn vendor_gc_dry_run_is_read_only() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + let state_before = tokio::fs::read(tmp.path().join(".socket/vendor/state.json")) + .await + .unwrap(); + let manifest_before = tokio::fs::read(&manifest_path).await.unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, true).await; + assert_eq!(out.unused_reverted, vec![PURL.to_string()], "{out:?}"); + assert_eq!( + tokio::fs::read(tmp.path().join(".socket/vendor/state.json")) + .await + .unwrap(), + state_before, + "dry run must not touch the ledger" + ); + assert_eq!( + tokio::fs::read(&manifest_path).await.unwrap(), + manifest_before, + "dry run must not touch the manifest" + ); + assert!( + tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + "dry run must not remove artifacts" + ); + } + + /// A missing/undeterminable lockfile keeps the entry (fail-safe), and a + /// DETACHED entry is exempt from both (a) and (b). + #[tokio::test] + async fn vendor_gc_keeps_undeterminable_and_detached_entries() { + // Lock removed entirely: probe says None → keep. + let (tmp, common, manifest_path) = gc_fixture(false).await; + tokio::fs::remove_file(tmp.path().join("package-lock.json")) + .await + .unwrap(); + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + + // Detached entry: absent from the manifest AND lockfile-invisible — + // exactly its normal state. Never reverted by the GC. 
+ let (tmp, common, manifest_path) = gc_fixture(true).await; + write_manifest(&manifest_path, &PatchManifest::new()) + .await + .unwrap(); + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.dropped_reverted.is_empty(), "{out:?}"); + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + } + + /// (c) uuid dirs with no owning ledger entry are swept (wet) / counted + /// (dry). + #[tokio::test] + async fn vendor_gc_sweeps_orphan_uuid_dirs() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + let orphan_uuid = "1a2b3c4d-5e6f-4a1b-8c2d-9e0f1a2b3c4d"; + let orphan_dir = tmp.path().join(format!(".socket/vendor/npm/{orphan_uuid}")); + tokio::fs::create_dir_all(&orphan_dir).await.unwrap(); + tokio::fs::write(orphan_dir.join("left-pad-1.3.0.tgz"), b"tgz") + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, true).await; + assert_eq!(out.orphan_dirs, 1, "{out:?}"); + assert!(orphan_dir.exists(), "dry run keeps the orphan"); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.orphan_dirs, 1, "{out:?}"); + assert!(!orphan_dir.exists(), "wet run sweeps the orphan"); + // The recorded entry's dir survives the sweep. 
+ assert!(tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists()); + } +} diff --git a/crates/socket-patch-cli/tests/apply_network.rs b/crates/socket-patch-cli/tests/apply_network.rs index d9bb628..08f1fd8 100644 --- a/crates/socket-patch-cli/tests/apply_network.rs +++ b/crates/socket-patch-cli/tests/apply_network.rs @@ -449,49 +449,98 @@ async fn apply_with_force_overrides_hash_mismatch() { } #[tokio::test] -async fn apply_without_force_hash_mismatch_emits_failed_event() { +async fn apply_hash_mismatch_default_warns_and_applies_strict_fails() { let after = b"after\n"; let after_hash = git_sha256(after); let expected_before = b"expected-before\n"; let actual_before = b"DIFFERENT-CONTENT\n"; let expected_before_hash = git_sha256(expected_before); - let tmp = tempfile::tempdir().expect("tempdir"); - write_root_package_json(tmp.path()); - write_npm_package(tmp.path(), "mismatch", "1.0.0", "index.js", actual_before); - let socket = tmp.path().join(".socket"); - write_manifest_with_patch( - &socket, - "pkg:npm/mismatch@1.0.0", - "11111111-1111-4111-8111-111111111111", - &expected_before_hash, - &after_hash, - ); - let blobs = socket.join("blobs"); - std::fs::create_dir_all(&blobs).unwrap(); - std::fs::write(blobs.join(&after_hash), after).unwrap(); + let fixture = || { + let tmp = tempfile::tempdir().expect("tempdir"); + write_root_package_json(tmp.path()); + write_npm_package(tmp.path(), "mismatch", "1.0.0", "index.js", actual_before); + let socket = tmp.path().join(".socket"); + write_manifest_with_patch( + &socket, + "pkg:npm/mismatch@1.0.0", + "11111111-1111-4111-8111-111111111111", + &expected_before_hash, + &after_hash, + ); + let blobs = socket.join("blobs"); + std::fs::create_dir_all(&blobs).unwrap(); + std::fs::write(blobs.join(&after_hash), after).unwrap(); + tmp + }; + // DEFAULT: the mismatch is overwritten with the full verified patched + // content (the diff strategy would self-skip; the blob is hash-gated to + // afterHash) and surfaced as a 
warning event — exit 0. + let tmp = fixture(); let out = Command::new(binary()) .args(["apply", "--json", "--offline"]) .current_dir(tmp.path()) .env_remove("SOCKET_API_TOKEN") .output() .expect("run socket-patch"); - let code = out.status.code().unwrap_or(-1); let stdout = String::from_utf8_lossy(&out.stdout).to_string(); - assert_eq!(code, 1, "hash mismatch w/o --force must exit 1"); let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); - assert_eq!(v["status"], "partialFailure"); + assert_eq!( + out.status.code().unwrap_or(-1), + 0, + "default mismatch is a warning, not an error: {v:#}" + ); + assert_eq!(v["status"], "success", "{v:#}"); let events = v["events"].as_array().expect("events array"); - let has_failed = events.iter().any(|e| e["action"] == "failed"); assert!( - has_failed, - "must emit a failed event on hash mismatch; got events={events:?}" + events.iter().any(|e| e["action"] == "applied"), + "{events:?}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "content_mismatch_overwritten"), + "the overwrite is surfaced as a warning event: {events:?}" ); + let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); + assert_eq!(content, after, "the file carries the verified patched bytes"); - // File must be UNCHANGED. + // The human run logs the warning to stderr. + let tmp = fixture(); + let out = Command::new(binary()) + .args(["apply", "--offline", "--yes"]) + .current_dir(tmp.path()) + .env_remove("SOCKET_API_TOKEN") + .output() + .expect("run socket-patch"); + let stderr = String::from_utf8_lossy(&out.stderr); + assert_eq!(out.status.code().unwrap_or(-1), 0, "stderr={stderr}"); + assert!( + stderr.contains("content_mismatch_overwritten"), + "stderr warning present: {stderr}" + ); + + // --strict: the old fail-closed contract — exit 1, failed event, file + // untouched. 
+ let tmp = fixture(); + let out = Command::new(binary()) + .args(["apply", "--json", "--offline", "--strict"]) + .current_dir(tmp.path()) + .env_remove("SOCKET_API_TOKEN") + .output() + .expect("run socket-patch"); + let stdout = String::from_utf8_lossy(&out.stdout).to_string(); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(out.status.code().unwrap_or(-1), 1, "{v:#}"); + assert_eq!(v["status"], "partialFailure", "{v:#}"); + let events = v["events"].as_array().expect("events array"); + assert!( + events.iter().any(|e| e["action"] == "failed"), + "strict emits a failed event: {events:?}" + ); let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); - assert_eq!(content, actual_before, "hash mismatch must not modify file"); + assert_eq!(content, actual_before, "strict must not modify the file"); } // --------------------------------------------------------------------------- @@ -650,3 +699,87 @@ async fn apply_uses_locally_cached_blob_without_fetching() { "cached blob must survive apply" ); } + +// --------------------------------------------------------------------------- +// Mismatch + diff-mode sources: the full blob is redownloaded on demand. +// --------------------------------------------------------------------------- + +/// A mismatched file cannot be patched from a partial source (the diff +/// strategy needs the exact before-bytes), so the default mismatch policy +/// redownloads the FULL afterHash blob and applies that — even when a +/// local source archive made the stage step skip downloading. 
+#[tokio::test] +async fn apply_mismatch_redownloads_full_blob_and_applies() { + let after = b"after\n"; + let after_hash = git_sha256(after); + let expected_before_hash = git_sha256(b"expected-before\n"); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(path(format!( + "/v0/orgs/{ORG_SLUG}/patches/blob/{after_hash}" + ))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(after.to_vec())) + .mount(&mock) + .await; + + let uuid = "11111111-1111-4111-8111-111111111111"; + let tmp = tempfile::tempdir().expect("tempdir"); + write_root_package_json(tmp.path()); + write_npm_package( + tmp.path(), + "mismatch", + "1.0.0", + "index.js", + b"DIFFERENT-CONTENT\n", + ); + let socket = tmp.path().join(".socket"); + write_manifest_with_patch( + &socket, + "pkg:npm/mismatch@1.0.0", + uuid, + &expected_before_hash, + &after_hash, + ); + // A LOCAL package archive exists (so the stage step downloads nothing) + // but carries no entry for index.js — only the blob can produce the + // patched bytes, and no blob is staged. 
+ let packages = socket.join("packages"); + std::fs::create_dir_all(&packages).unwrap(); + { + use std::io::Write as _; + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + std::fs::File::create(packages.join(format!("{uuid}.tar.gz"))).unwrap(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + let bytes = b"unrelated"; + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, "other.js", &bytes[..]).unwrap(); + builder.into_inner().unwrap().finish().unwrap().flush().unwrap(); + } + + let (code, stdout, stderr) = run_apply(tmp.path(), &mock.uri(), &[]); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(code, 0, "stdout={v:#}\nstderr={stderr}"); + let events = v["events"].as_array().expect("events array"); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "content_mismatch_overwritten"), + "{events:?}" + ); + + // The blob was fetched on demand… + let requests = mock.received_requests().await.unwrap(); + let blob_path = format!("/v0/orgs/{ORG_SLUG}/patches/blob/{after_hash}"); + assert!( + requests.iter().any(|r| r.url.path() == blob_path), + "the full blob must be redownloaded for the mismatched file" + ); + // …and the file carries the verified patched bytes. + let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); + assert_eq!(content, after); +} diff --git a/crates/socket-patch-cli/tests/cli_global_args.rs b/crates/socket-patch-cli/tests/cli_global_args.rs index bbcb84b..6faebd8 100644 --- a/crates/socket-patch-cli/tests/cli_global_args.rs +++ b/crates/socket-patch-cli/tests/cli_global_args.rs @@ -202,6 +202,7 @@ fn global_flag_cases_cover_every_global_field() { break_lock: _, debug: _, no_telemetry: _, + strict: _, } = common; // 20 fields ↔ 20 long-flag cases. 
Bump both this count and add a case when diff --git a/crates/socket-patch-cli/tests/cli_parse_scan.rs b/crates/socket-patch-cli/tests/cli_parse_scan.rs index 359994f..b961eb4 100644 --- a/crates/socket-patch-cli/tests/cli_parse_scan.rs +++ b/crates/socket-patch-cli/tests/cli_parse_scan.rs @@ -523,6 +523,7 @@ fn scan_json_empty_cwd_emits_updates_key() { let expected = serde_json::json!({ "status": "success", "scannedPackages": 0, + "lockfileOnlyPackages": 0, "packagesWithPatches": 0, "totalPatches": 0, "freePatches": 0, diff --git a/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs b/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs index 12e1b1a..28c118d 100644 --- a/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs +++ b/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs @@ -81,6 +81,10 @@ fn cargo(cwd: &Path, args: &[&str], cargo_home: &Path) -> Output { .args(args) .current_dir(cwd) .env("CARGO_HOME", cargo_home) + // The assertions read `/target/debug/...`; an ambient + // CARGO_TARGET_DIR (shared-build-cache setups) would redirect the + // child build elsewhere and break them. + .env_remove("CARGO_TARGET_DIR") .output() .expect("failed to run cargo") } diff --git a/crates/socket-patch-cli/tests/in_process_cargo_apply.rs b/crates/socket-patch-cli/tests/in_process_cargo_apply.rs index 860c4dd..bf17233 100644 --- a/crates/socket-patch-cli/tests/in_process_cargo_apply.rs +++ b/crates/socket-patch-cli/tests/in_process_cargo_apply.rs @@ -293,12 +293,14 @@ async fn cargo_fetch_scan_sync_patches_real_file() { } /// Safety gate: when the patch's advertised `beforeHash` does NOT match the -/// on-disk file, apply must REFUSE to write (it cannot trust that the blob is -/// a valid successor of whatever is actually on disk). The positive test -/// above only ever feeds a correct `beforeHash`, so a regression that made -/// apply blindly clobber the file regardless of its current content would -/// sail through it. 
This test pins the refusal: the file must be left -/// byte-for-byte untouched and the run must NOT report success. +/// on-disk file, `--strict` apply must REFUSE to write (the v3.4 DEFAULT +/// instead overwrites with the verified afterHash content and warns — see +/// `apply_hash_mismatch_default_warns_and_applies_strict_fails`). The +/// positive test above only ever feeds a correct `beforeHash`, so a +/// regression that made strict mode clobber the file regardless of its +/// current content would sail through it. This test pins the strict +/// refusal: the file must be left byte-for-byte untouched and the run must +/// NOT report success. #[tokio::test] #[serial] async fn cargo_apply_refuses_on_before_hash_mismatch() { @@ -344,9 +346,10 @@ async fn cargo_apply_refuses_on_before_hash_mismatch() { ecosystems: Some(vec!["cargo".to_string()]), download_mode: "diff".to_string(), dry_run: false, - // force MUST stay false: with --force, a hash mismatch is - // deliberately downgraded to "ready" and the file WOULD be - // overwritten. We are asserting the safe default refuses. + // strict pins the fail-closed contract: the v3.4 default (and + // --force) deliberately downgrade a hash mismatch to "ready" + // and the file WOULD be overwritten with verified content. + strict: true, ..socket_patch_cli::args::GlobalArgs::default() }, batch_size: 100, diff --git a/crates/socket-patch-cli/tests/in_process_get_update_count.rs b/crates/socket-patch-cli/tests/in_process_get_update_count.rs index a2101ae..36e4bef 100644 --- a/crates/socket-patch-cli/tests/in_process_get_update_count.rs +++ b/crates/socket-patch-cli/tests/in_process_get_update_count.rs @@ -72,6 +72,7 @@ fn params(root: &Path, server: &MockServer) -> DownloadParams { org_slug: Some(ORG.to_string()), proxy_url: None, }, + strict: false, // Skip release-narrowing; npm has no variants anyway. 
all_releases: true, } diff --git a/crates/socket-patch-cli/tests/in_process_vendor.rs b/crates/socket-patch-cli/tests/in_process_vendor.rs index 0ef740c..791d9f9 100644 --- a/crates/socket-patch-cli/tests/in_process_vendor.rs +++ b/crates/socket-patch-cli/tests/in_process_vendor.rs @@ -1228,3 +1228,181 @@ fn json_envelope_shape() { assert_eq!(env["status"], "noManifest"); assert!(events(&env).is_empty()); } + +// ──────────────── vendor auto-force + already-applied lifecycle ──────────────── + +/// A package already patched IN PLACE by `apply` must vendor cleanly on the +/// first run — and the envelope must report it as `applied` (this run packed +/// the artifact and rewired the lock), NOT `skipped/already_vendored`. The +/// second run is the true in-sync rerun and reports `already_vendored`. +#[test] +fn vendor_after_in_place_apply_emits_applied_event() { + let fx = npm_fixture(); + // Simulate a prior in-place `socket-patch apply`. + std::fs::write(fx.installed_index(), PATCHED_INDEX).unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], PURL); + assert_eq!( + env["summary"]["applied"], 1, + "first vendor of an applied package counts as applied: {env:#}" + ); + assert!(fx.tgz_path().exists(), "artifact packed"); + assert!(fx.state_path().exists(), "ledger entry recorded"); + // No mismatch warning: afterHash content is AlreadyPatched, not divergent. + assert!( + !events(&env) + .iter() + .any(|e| e["errorCode"] == "vendor_content_mismatch_overwritten"), + "{env:#}" + ); + + // Second run: artifact + wiring already in sync. 
+ let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + find_event(&env, "skipped", Some("already_vendored")); + assert_eq!(env["summary"]["applied"], 0); +} + +/// Installed content matching NEITHER hash (a patch built against different +/// bytes than the installed artifact — the flatted@3.3.1 case) still vendors: +/// the stage is overwritten with the verified patched content, the run exits +/// 0 with an `applied` event, and the overwrite surfaces as a +/// `vendor_content_mismatch_overwritten` warning event. +#[test] +fn mismatched_baseline_vendors_with_warning_event() { + let fx = npm_fixture(); + std::fs::write( + fx.installed_index(), + b"module.exports = () => 'divergent';\n", + ) + .unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], PURL); + let warning = find_event(&env, "skipped", Some("vendor_content_mismatch_overwritten")); + assert!( + warning["reason"] + .as_str() + .unwrap_or("") + .contains("left-pad@1.3.0"), + "warning names the package: {env:#}" + ); + assert!(fx.tgz_path().exists(), "artifact packed despite the mismatch"); + // The installed tree keeps its divergent bytes (only the stage changed). + assert_eq!( + std::fs::read(fx.installed_index()).unwrap(), + b"module.exports = () => 'divergent';\n" + ); +} + +/// A patch-target file MISSING from the installed package still fails closed +/// (auto-force must not inherit `--force`'s silent NotFound skip — the +/// tarball would ship without the fix); `--force` keeps that tolerance. 
+#[test] +fn vendor_missing_file_fails_closed_without_force() { + let fx = npm_fixture(); + std::fs::remove_file(fx.installed_index()).unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_ne!(code, 0, "missing patch target must fail: {env:#}"); + let failed = find_event(&env, "failed", None); + assert!( + failed["error"] + .as_str() + .unwrap_or("") + .contains("File not found"), + "{env:#}" + ); + assert_eq!(fx.lock_bytes(), fx.original_lock, "lock byte-untouched"); + assert!(!fx.vendor_dir().exists(), "no artifacts on failure"); + + // --force: the missing file is tolerated (skipped) and the vendor lands. + let fx2 = npm_fixture(); + std::fs::remove_file(fx2.installed_index()).unwrap(); + let (code, env) = vendor_cli(fx2.root(), &["--force"]); + assert_eq!(code, 0, "{env:#}"); +} + +// ──────────────── percent-encoded scoped purls (Fix A integration) ──────────────── + +/// Build a fixture whose installed package is the SCOPED `@scope/left-pad` +/// while the manifest keys the patch by the API's percent-encoded purl +/// (`pkg:npm/%40scope/left-pad@1.3.0`) — exactly what `scan` writes. +fn npm_scoped_fixture() -> NpmFixture { + let fx = npm_fixture_with_purls(&["pkg:npm/%40scope/left-pad@1.3.0"]); + let root = fx.root(); + + // Re-home the installed package under the scope dir. + let scoped = root.join("node_modules/@scope/left-pad"); + std::fs::create_dir_all(scoped.parent().unwrap()).unwrap(); + std::fs::rename(root.join("node_modules/left-pad"), &scoped).unwrap(); + std::fs::write( + scoped.join("package.json"), + br#"{"name":"@scope/left-pad","version":"1.3.0"}"#, + ) + .unwrap(); + + // Re-key the lock entry to the scoped install path. 
+ let mut lock: Value = serde_json::from_slice(&fx.original_lock).unwrap(); + let packages = lock["packages"].as_object_mut().unwrap(); + let entry = packages.remove("node_modules/left-pad").unwrap(); + packages.insert("node_modules/@scope/left-pad".to_string(), entry); + lock["packages"][""]["dependencies"] = json!({ "@scope/left-pad": "^1.3.0" }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), &lock_bytes).unwrap(); + + fx +} + +/// The API serves scoped purls percent-encoded and `scan` stores them +/// verbatim as manifest keys; vendor must decode them to find the installed +/// `node_modules/@scope/...` package and wire the lock — while the ledger +/// stays keyed by the verbatim encoded purl (manifest parity). +#[test] +fn vendor_resolves_percent_encoded_scope_purl() { + let fx = npm_scoped_fixture(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], "pkg:npm/%40scope/left-pad@1.3.0"); + + // Artifact lands under the DECODED scope dir. + let tgz = fx + .root() + .join(format!(".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz")); + assert!(tgz.exists(), "tarball at the decoded scoped path"); + + // Lock rewired to the vendored artifact. + let lock = fx.lock_value(); + assert_eq!( + lock["packages"]["node_modules/@scope/left-pad"]["resolved"], + json!(format!( + "file:.socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )) + ); + + // Ledger keyed by the VERBATIM encoded purl (manifest key parity). + let state: Value = + serde_json::from_slice(&std::fs::read(fx.state_path()).unwrap()).unwrap(); + assert!( + state["entries"]["pkg:npm/%40scope/left-pad@1.3.0"].is_object(), + "state keyed by the encoded manifest purl: {state:#}" + ); + + // Round-trip: revert restores the original (scoped) lock bytes. 
+ let (code, env) = vendor_cli(fx.root(), &["--revert"]); + assert_eq!(code, 0, "{env:#}"); + let lock = fx.lock_value(); + assert_eq!( + lock["packages"]["node_modules/@scope/left-pad"]["resolved"], + json!(REG_RESOLVED) + ); + assert!(!fx.vendor_dir().join("npm").exists(), "artifacts removed"); +} diff --git a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs index cc13f29..d3e8cb5 100644 --- a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs +++ b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs @@ -412,3 +412,733 @@ async fn scan_vendor_flag_conflicts_are_clap_errors() { ); } } + +// ───────────── percent-encoded scoped purls (API canonical form) ───────────── + +const SCOPED_CRAWLER_PURL: &str = "pkg:npm/@scope/left-pad@1.3.0"; +const SCOPED_API_PURL: &str = "pkg:npm/%40scope/left-pad@1.3.0"; + +/// Like `write_fixture`, but the installed package is the SCOPED +/// `@scope/left-pad` (the crawler reports the literal `@scope` form). +fn write_scoped_fixture(root: &Path) { + std::fs::write( + root.join("package.json"), + r#"{ "name": "scan-vendor-test", "version": "0.0.0" }"#, + ) + .unwrap(); + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "scan-vendor-test", + "version": "0.0.0", + "dependencies": { "@scope/left-pad": "^1.3.0" } + }, + "node_modules/@scope/left-pad": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@scope/left-pad/-/left-pad-1.3.0.tgz", + "integrity": "sha512-orig==", + "license": "WTFPL" + } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap(); + + let pkg = root.join("node_modules/@scope/left-pad"); + std::fs::create_dir_all(&pkg).unwrap(); + std::fs::write( + pkg.join("package.json"), + br#"{"name":"@scope/left-pad","version":"1.3.0"}"#, + ) + 
.unwrap(); + std::fs::write(pkg.join("index.js"), BEFORE).unwrap(); +} + +/// Mock API that serves the patch under the percent-ENCODED purl (the +/// canonical form the production patches API returns for scoped packages), +/// while the batch request/response is keyed by the crawler's literal form. +async fn mount_scoped_patch_api(mock: &MockServer, uuid: &str) { + let before_hash = git_sha256(BEFORE); + let after_hash = git_sha256(AFTER); + Mock::given(method("POST")) + .and(path(format!("/v0/orgs/{ORG_SLUG}/patches/batch"))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "packages": [{ + "purl": SCOPED_CRAWLER_PURL, + "patches": [{ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "tier": "free", + "cveIds": ["CVE-2026-0001"], + "ghsaIds": [], + "severity": "high", + "title": "vendor target" + }] + }], + "canAccessPaidPatches": false, + }))) + .mount(mock) + .await; + // Per-package search: the crawler purl, urlencoded. + Mock::given(method("GET")) + .and(path(format!( + "/v0/orgs/{ORG_SLUG}/patches/by-package/pkg%3Anpm%2F%40scope%2Fleft-pad%401.3.0" + ))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "patches": [{ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "publishedAt": "2026-01-01T00:00:00Z", + "description": "Vendor patch", + "license": "MIT", + "tier": "free", + "vulnerabilities": {} + }], + "canAccessPaidPatches": false, + }))) + .mount(mock) + .await; + Mock::given(method("GET")) + .and(path(format!("/v0/orgs/{ORG_SLUG}/patches/view/{uuid}"))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "publishedAt": "2026-01-01T00:00:00Z", + "files": { + "package/index.js": { + "beforeHash": before_hash, + "afterHash": after_hash, + "blobContent": AFTER_B64, + } + }, + "vulnerabilities": {}, + "description": "Vendor patch", + "license": "MIT", + "tier": "free", + }))) + .mount(mock) + .await; +} + +/// The production patches API 
serves scoped purls percent-encoded +/// (`pkg:npm/%40scope/...`) and scan stores them verbatim as manifest keys. +/// The whole pipeline — download, vendor lookup against the literal +/// `node_modules/@scope/...` install, lock rewiring, prune exemption — must +/// bridge the two spellings. (Flowise regression: `%40modelcontextprotocol` +/// failed with `package not installed`.) +#[tokio::test] +async fn scan_vendor_resolves_percent_encoded_scoped_purl() { + let mock = MockServer::start().await; + mount_scoped_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_scoped_fixture(tmp.path()); + + // --prune in the same run: the freshly-downloaded ENCODED manifest + // entry must not be GC'd against the literal crawler purl. + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &["--prune"]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["status"], "success", "envelope={v}"); + + // Manifest keyed by the verbatim encoded purl — and NOT pruned. + let manifest: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/manifest.json")).unwrap(), + ) + .unwrap(); + assert_eq!( + manifest["patches"][SCOPED_API_PURL]["uuid"], UUID, + "manifest={manifest}" + ); + assert_eq!( + v["gc"]["prunedManifestEntries"], + serde_json::json!([]), + "the encoded entry must not look prunable: {v}" + ); + + // Vendored: artifact under the DECODED scope dir, lock rewired. 
+ assert_eq!(v["vendor"]["summary"]["applied"], 1, "envelope={v}"); + let tgz = tmp.path().join(format!( + ".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )); + assert!(tgz.is_file(), "tarball at the decoded scoped path"); + let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); + assert!( + lock.contains(&format!(".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz")), + "lock consumes the vendored tarball; lock={lock}" + ); + // Ledger keyed by the verbatim encoded purl. + let state: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")).unwrap(), + ) + .unwrap(); + assert_eq!(state["entries"][SCOPED_API_PURL]["uuid"], UUID, "{state}"); +} + +// ───────────────────── prune reconciles vendored state ───────────────────── + +/// After a dependency is removed and re-locked, `scan --prune` (without +/// `--vendor`) reverts the now-unused vendored entry: lock restored, ledger +/// entry + manifest entry dropped, artifact dir removed. +#[tokio::test] +async fn scan_prune_reverts_unused_vendored_entry() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture(tmp.path()); + + // A second installed package so the later prune run's crawl is + // non-empty (left-pad itself gets removed below). + let other = tmp.path().join("node_modules/keeper"); + std::fs::create_dir_all(&other).unwrap(); + std::fs::write( + other.join("package.json"), + br#"{"name":"keeper","version":"1.0.0"}"#, + ) + .unwrap(); + + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + + // Simulate `npm uninstall left-pad` + re-lock: drop the dep from the + // lock graph and remove the installed copy. The override-free npm + // wiring leaves nothing else behind. 
+ let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { "name": "scan-vendor-test", "version": "0.0.0" } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(tmp.path().join("package-lock.json"), &lock_bytes).unwrap(); + std::fs::remove_dir_all(tmp.path().join("node_modules/left-pad")).unwrap(); + + // Plain prune scan (read-only discovery + GC; no --vendor, no --apply). + let out = Command::new(binary()) + .args([ + "scan", + "--json", + "--prune", + "--yes", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let code = out.status.code().unwrap_or(-1); + assert_eq!(code, 0, "stdout={stdout}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + + assert_eq!( + v["gc"]["revertedVendoredEntries"], + serde_json::json!([PURL]), + "gc must report the reverted entry: {v}" + ); + + // Ledger empty (an emptied state file may be removed outright), + // manifest entry dropped, artifact gone. 
+ match std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")) { + Ok(text) => { + let state: serde_json::Value = serde_json::from_str(&text).unwrap(); + assert!( + state["entries"].as_object().is_none_or(|m| m.is_empty()), + "ledger entry removed: {state}" + ); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => panic!("unexpected state.json read error: {e}"), + } + let manifest: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/manifest.json")).unwrap(), + ) + .unwrap(); + assert!( + manifest["patches"] + .as_object() + .is_none_or(|m| !m.contains_key(PURL)), + "manifest entry dropped: {manifest}" + ); + assert!( + !tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + "artifact dir removed" + ); + // The (already left-pad-free) lock stays exactly as the user re-locked + // it — the revert had nothing to restore there. + assert_eq!( + std::fs::read(tmp.path().join("package-lock.json")).unwrap(), + lock_bytes + ); +} + +/// Interactive (non-JSON) `scan --vendor` pre-verifies patch baselines: +/// installed content matching NEITHER hash is annotated BEFORE the +/// confirm prompt, and the run still vendors (auto-force) with the +/// `vendor_content_mismatch_overwritten` warning on stderr. +#[tokio::test] +async fn scan_vendor_annotates_mismatched_baseline_and_vendors_anyway() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture(tmp.path()); + // Divergent installed bytes: neither BEFORE nor AFTER. 
+ std::fs::write( + tmp.path().join("node_modules/left-pad/index.js"), + b"divergent\n", + ) + .unwrap(); + + let out = Command::new(binary()) + .args([ + "scan", + "--vendor", + "--yes", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + assert_eq!( + out.status.code().unwrap_or(-1), + 0, + "stdout={stdout}; stderr={stderr}" + ); + assert!( + stdout.contains("installed content differs from patch baseline"), + "pre-prompt annotation present; stdout={stdout}" + ); + assert!( + stderr.contains("vendor_content_mismatch_overwritten"), + "overwrite warning surfaced; stderr={stderr}" + ); + // Vendored despite the mismatch. + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); +} + +// ───────────── lockfile auto-fetch + scan lockfile supplement ───────────── + +/// sha512 SRI of the given bytes (what an npm-family lock records). +fn sri_of(bytes: &[u8]) -> String { + use base64::Engine as _; + use sha2::Sha512; + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) +} + +/// A pristine registry tarball for left-pad@1.3.0 whose index.js carries +/// the patch's BEFORE bytes. 
+fn pristine_tgz() -> Vec { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes) in [ + ( + "package/package.json", + br#"{"name":"left-pad","version":"1.3.0"}"#.as_slice(), + ), + ("package/index.js", BEFORE), + ] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, path, bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() +} + +/// Project fixture with a lockfile but NO node_modules: package.json + +/// package-lock.json whose left-pad entry resolves to `resolved_url` with +/// `integrity`. +fn write_lockfile_only_fixture(root: &Path, resolved_url: &str, integrity: &str) { + std::fs::write( + root.join("package.json"), + r#"{ "name": "scan-vendor-test", "version": "0.0.0", "dependencies": { "left-pad": "^1.3.0" } }"#, + ) + .unwrap(); + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "scan-vendor-test", + "version": "0.0.0", + "dependencies": { "left-pad": "^1.3.0" } + }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": resolved_url, + "integrity": integrity, + "license": "WTFPL" + } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap(); +} + +/// Pre-seed `.socket/manifest.json` + the after-blob so a standalone +/// `vendor` run has local patch sources (no patch-API traffic). 
+fn seed_manifest_and_blob(root: &Path) { + let socket = root.join(".socket"); + std::fs::create_dir_all(socket.join("blobs")).unwrap(); + let manifest = serde_json::json!({ + "patches": { + PURL: { + "uuid": UUID, + "exportedAt": "2026-01-01T00:00:00Z", + "files": { + "package/index.js": { + "beforeHash": git_sha256(BEFORE), + "afterHash": git_sha256(AFTER), + } + }, + "vulnerabilities": {}, + "description": "synthetic", + "license": "MIT", + "tier": "free" + } + } + }); + std::fs::write( + socket.join("manifest.json"), + serde_json::to_vec_pretty(&manifest).unwrap(), + ) + .unwrap(); + std::fs::write(socket.join("blobs").join(git_sha256(AFTER)), AFTER).unwrap(); +} + +async fn mount_registry_tarball(mock: &MockServer, tgz: Vec) { + Mock::given(method("GET")) + .and(path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(mock) + .await; +} + +fn run_vendor(root: &Path, extra: &[&str]) -> (i32, serde_json::Value, String) { + let mut argv = vec!["vendor", "--json"]; + argv.extend_from_slice(extra); + let out = Command::new(binary()) + .args(&argv) + .current_dir(root) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run vendor"); + let stdout = String::from_utf8_lossy(&out.stdout).into_owned(); + let stderr = String::from_utf8_lossy(&out.stderr).into_owned(); + let v: serde_json::Value = serde_json::from_str(stdout.trim()) + .unwrap_or_else(|e| panic!("vendor --json must emit JSON: {e}\n{stdout}\n{stderr}")); + (out.status.code().unwrap_or(-1), v, stderr) +} + +/// A manifest patch whose package is NOT installed but IS lockfile-resolved +/// is fetched pristine from the registry (integrity-verified against the +/// lock) and vendored — node_modules never appears. 
+#[tokio::test] +async fn vendor_auto_fetches_missing_package_from_lockfile() { + let mock = MockServer::start().await; + let tgz = pristine_tgz(); + let integrity = sri_of(&tgz); + mount_registry_tarball(&mock, tgz).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_eq!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events.iter().any(|e| e["action"] == "applied" && e["purl"] == PURL), + "{v:#}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "vendor_fetched_missing"), + "fetch surfaced as a warning event: {v:#}" + ); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); + let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); + assert!(lock.contains(&format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"))); + assert!( + !tmp.path().join("node_modules").exists(), + "the project tree is never touched" + ); +} + +/// Integrity mismatch between the lock and the served bytes is a distinct +/// vendor_fetch_failed failure — and nothing is written. 
+#[tokio::test] +async fn vendor_fetch_integrity_mismatch_is_vendor_fetch_failed() { + let mock = MockServer::start().await; + mount_registry_tarball(&mock, pristine_tgz()).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &sri_of(b"the lock expects different bytes"), + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_ne!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["action"] == "failed" && e["errorCode"] == "vendor_fetch_failed"), + "{v:#}" + ); + assert!( + !events + .iter() + .any(|e| e["errorCode"] == "package_not_installed"), + "no duplicate not-installed skip: {v:#}" + ); + assert!(!tmp.path().join(".socket/vendor").exists()); +} + +/// --offline refuses the fetch with a calm package_not_installed skip that +/// names the lockfile as the would-be source. No HTTP traffic happens (no +/// registry route is mounted — a request would 404 and fail differently). 
+#[tokio::test] +async fn vendor_offline_refuses_fetch_with_calm_skip() { + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "http://127.0.0.1:1/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"irrelevant"), + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &["--offline"]); + assert_ne!(code, 0, "not-installed stays a non-benign skip: {v:#}"); + let events = v["events"].as_array().unwrap(); + let skip = events + .iter() + .find(|e| e["errorCode"] == "package_not_installed") + .unwrap_or_else(|| panic!("{v:#}")); + assert!( + skip["reason"] + .as_str() + .unwrap_or("") + .contains("--offline prevents fetching"), + "offline detail names the lockfile resolution: {v:#}" + ); +} + +/// An entry whose lock records no integrity is never fetched (fail-closed) +/// and keeps the plain not-installed outcome plus an explanatory warning. +#[tokio::test] +async fn vendor_fetch_unverifiable_lock_entry_stays_not_installed() { + let tmp = tempfile::tempdir().unwrap(); + // Hand-write a lock whose entry has no integrity field. 
+ std::fs::write( + tmp.path().join("package.json"), + r#"{ "name": "x", "version": "0.0.0" }"#, + ) + .unwrap(); + std::fs::write( + tmp.path().join("package-lock.json"), + serde_json::to_vec_pretty(&serde_json::json!({ + "name": "x", "version": "0.0.0", "lockfileVersion": 3, + "packages": { + "": { "name": "x", "version": "0.0.0" }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": "http://127.0.0.1:1/left-pad/-/left-pad-1.3.0.tgz" + } + } + })) + .unwrap(), + ) + .unwrap(); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_ne!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "vendor_fetch_unverifiable"), + "{v:#}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "package_not_installed"), + "{v:#}" + ); +} + +/// The headline flow: a COMPLETELY fresh clone (lockfile, no node_modules, +/// no .socket) discovers from the lockfile and `scan --vendor` vendors +/// end-to-end via the registry fetch. 
+#[tokio::test] +async fn scan_vendor_works_on_a_completely_fresh_clone() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tgz = pristine_tgz(); + let integrity = sri_of(&tgz); + mount_registry_tarball(&mock, tgz).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["lockfileOnlyPackages"], 1, "{v}"); + assert_eq!(v["vendor"]["summary"]["applied"], 1, "{v}"); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); + assert!(!tmp.path().join("node_modules").exists()); + + // Second run: in sync. + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + let events = v["vendor"]["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "already_vendored"), + "{v}" + ); +} + +/// Read-only discovery flags lockfile-only packages in JSON and the human +/// table. +#[tokio::test] +async fn scan_discovers_lockfile_only_packages_with_warning() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"unused for discovery"), + ); + + // JSON shape. 
+ let out = Command::new(binary()) + .args([ + "scan", "--json", "--api-url", &mock.uri(), "--api-token", "fake-token", + "--org", ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["scannedPackages"], 1, "{v}"); + assert_eq!(v["lockfileOnlyPackages"], 1, "{v}"); + assert_eq!(v["packages"][0]["notInstalled"], true, "{v}"); + + // Human output: the table marker + the note. + let out = Command::new(binary()) + .args([ + "scan", "--api-url", &mock.uri(), "--api-token", "fake-token", + "--org", ORG_SLUG, "--dry-run", "--yes", + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + assert!( + stdout.contains("[NOT INSTALLED]"), + "stdout={stdout}; stderr={stderr}" + ); + assert!( + stderr.contains("not yet installed (lockfile-only)"), + "stderr={stderr}" + ); +} + +/// `scan --apply` skips lockfile-only patches calmly: exit 0, a skipped +/// record with package_not_installed, and NO manifest entry written. 
+#[tokio::test] +async fn scan_apply_skips_lockfile_only_without_error() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"unused"), + ); + + let out = Command::new(binary()) + .args([ + "scan", "--json", "--apply", "--yes", "--api-url", &mock.uri(), + "--api-token", "fake-token", "--org", ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let code = out.status.code().unwrap_or(-1); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(code, 0, "lockfile-only must not flip the exit code: {v}"); + assert_eq!(v["status"], "success", "{v}"); + let patches = v["apply"]["patches"].as_array().unwrap(); + assert!( + patches.iter().any(|p| p["action"] == "skipped" + && p["errorCode"] == "package_not_installed"), + "{v}" + ); + assert!( + !tmp.path().join(".socket/manifest.json").exists(), + "no manifest entry is written for a not-installed package" + ); +} diff --git a/crates/socket-patch-core/src/crawlers/deno_crawler.rs b/crates/socket-patch-core/src/crawlers/deno_crawler.rs index 5a12c2d..150cc27 100644 --- a/crates/socket-patch-core/src/crawlers/deno_crawler.rs +++ b/crates/socket-patch-core/src/crawlers/deno_crawler.rs @@ -120,16 +120,21 @@ impl DenoCrawler { // manifest PURL and are joined onto the cache root below. A real // JSR coordinate is a single path segment, so reject any that // could traverse out of the cache (`..`/`.`, a separator, NUL). + // The parser percent-decodes components, so these guards see the + // decoded form — `%2e%2e` cannot smuggle a traversal past them. // Unlike the cargo/npm crawlers there is no content check to catch // a bogus path, and jsr patches in place — so fail closed here. 
- if !(is_safe_jsr_component(scope) - && is_safe_jsr_component(name) - && is_safe_jsr_component(version)) + if !(is_safe_jsr_component(&scope) + && is_safe_jsr_component(&name) + && is_safe_jsr_component(&version)) { continue; } // Cache layout: //// - let pkg_dir = jsr_cache_path.join(scope).join(name).join(version); + let pkg_dir = jsr_cache_path + .join(&*scope) + .join(&*name) + .join(&*version); if !is_dir(&pkg_dir).await { continue; } diff --git a/crates/socket-patch-core/src/crawlers/npm_crawler.rs b/crates/socket-patch-core/src/crawlers/npm_crawler.rs index 91cb624..2d9f8d3 100644 --- a/crates/socket-patch-core/src/crawlers/npm_crawler.rs +++ b/crates/socket-patch-core/src/crawlers/npm_crawler.rs @@ -4,6 +4,7 @@ use std::path::{Path, PathBuf}; use serde::Deserialize; use super::types::{CrawledPackage, CrawlerOptions}; +use crate::utils::purl::{percent_decode_purl_component, strip_purl_qualifiers}; /// Default batch size for crawling. #[cfg(test)] @@ -686,11 +687,7 @@ impl NpmCrawler { /// Parse a PURL string to extract namespace, name, and version. fn parse_purl_components(purl: &str) -> Option<(Option, String, String)> { - // Strip qualifiers - let base = match purl.find('?') { - Some(idx) => &purl[..idx], - None => purl, - }; + let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:npm/")?; let at_idx = rest.rfind('@')?; @@ -701,16 +698,33 @@ impl NpmCrawler { return None; } - if name_part.starts_with('@') { - let slash_idx = name_part.find('/')?; - let namespace = name_part[..slash_idx].to_string(); - let name = name_part[slash_idx + 1..].to_string(); - if name.is_empty() { + // SECURITY: components are percent-decoded AFTER the `/`/`@` splits + // above (so an encoded `%2f` cannot create a new path segment here) + // and BEFORE the `is_safe_npm_component` guards in `find_by_purls` + // (so `%2e%2e` cannot smuggle a traversal past them). 
The API serves + // scoped purls as `pkg:npm/%40scope/name@version`, which must match + // the literal `node_modules/@scope/name` install. + let version = percent_decode_purl_component(version); + + if let Some(slash_idx) = name_part.find('/') { + let namespace = percent_decode_purl_component(&name_part[..slash_idx]); + let name = percent_decode_purl_component(&name_part[slash_idx + 1..]); + // An npm namespace is always an `@scope` (checked post-decode). + if name.is_empty() || !namespace.starts_with('@') { return None; } - Some((Some(namespace), name, version.to_string())) + Some(( + Some(namespace.into_owned()), + name.into_owned(), + version.into_owned(), + )) } else { - Some((None, name_part.to_string(), version.to_string())) + let name = percent_decode_purl_component(name_part); + // A bare `@scope` with no `/name` is not a package name. + if name.starts_with('@') { + return None; + } + Some((None, name.into_owned(), version.into_owned())) } } } @@ -1031,6 +1045,93 @@ mod tests { assert!(!result.contains_key("pkg:npm/not-installed@0.0.1")); } + /// Regression: the patches API serves scoped purls percent-encoded + /// (`pkg:npm/%40scope/name@version`) and `scan` stores them verbatim as + /// manifest keys. `find_by_purls` must decode the components to match + /// the literal `node_modules/@scope/name` install — while keeping the + /// result keyed by the *verbatim* encoded input (downstream contract). + #[test] + fn test_parse_purl_components_percent_encoded_scope() { + let (ns, name, ver) = + NpmCrawler::parse_purl_components("pkg:npm/%40modelcontextprotocol/sdk@1.12.0") + .unwrap(); + assert_eq!(ns.as_deref(), Some("@modelcontextprotocol")); + assert_eq!(name, "sdk"); + assert_eq!(ver, "1.12.0"); + // An encoded bare scope with no `/name` is still not a package. + assert!(NpmCrawler::parse_purl_components("pkg:npm/%40scope@1.0.0").is_none()); + // A `#subpath` without a qualifier must not bleed into the version. 
+ let (_, name, ver) = + NpmCrawler::parse_purl_components("pkg:npm/foo@1.0.0#lib/util").unwrap(); + assert_eq!(name, "foo"); + assert_eq!(ver, "1.0.0"); + } + + #[tokio::test] + async fn test_find_by_purls_percent_encoded_scope_resolves() { + let dir = tempfile::tempdir().unwrap(); + let nm = dir.path().join("node_modules"); + + let sdk_dir = nm.join("@modelcontextprotocol").join("sdk"); + tokio::fs::create_dir_all(&sdk_dir).await.unwrap(); + tokio::fs::write( + sdk_dir.join("package.json"), + r#"{"name": "@modelcontextprotocol/sdk", "version": "1.12.0"}"#, + ) + .await + .unwrap(); + + let crawler = NpmCrawler::new(); + let encoded = "pkg:npm/%40modelcontextprotocol/sdk@1.12.0".to_string(); + let result = crawler + .find_by_purls(&nm, std::slice::from_ref(&encoded)) + .await + .unwrap(); + + assert_eq!(result.len(), 1, "encoded scope must resolve: {result:?}"); + let pkg = result + .get(&encoded) + .expect("result keyed by the verbatim encoded input purl"); + assert_eq!(pkg.path, sdk_dir); + assert_eq!(pkg.name, "sdk"); + assert_eq!(pkg.namespace.as_deref(), Some("@modelcontextprotocol")); + } + + /// SECURITY regression: percent-encoded traversal sequences must be + /// rejected by the post-decode guards — `%2e%2e` decodes to `..` and + /// `%2f` to `/`, so guarding the *encoded* form would be a bypass. + #[tokio::test] + async fn test_find_by_purls_rejects_encoded_traversal() { + let root = tempfile::tempdir().unwrap(); + let nm = root.path().join("node_modules"); + // A real scope dir so a scoped traversal's kernel walk could resolve. + tokio::fs::create_dir_all(nm.join("@x")).await.unwrap(); + + // A victim package OUTSIDE node_modules, reachable only via `..`. 
+ let evil_dir = root.path().join("evil"); + tokio::fs::create_dir_all(&evil_dir).await.unwrap(); + tokio::fs::write( + evil_dir.join("package.json"), + r#"{"name": "evil", "version": "1.0.0"}"#, + ) + .await + .unwrap(); + + let crawler = NpmCrawler::new(); + let purls = vec![ + "pkg:npm/%2e%2e/evil@1.0.0".to_string(), + "pkg:npm/@x/%2e%2e@1.0.0".to_string(), + "pkg:npm/@x/%2e%2e%2f%2e%2e%2fevil@1.0.0".to_string(), + "pkg:npm/..%2fevil@1.0.0".to_string(), + ]; + let result = crawler.find_by_purls(&nm, &purls).await.unwrap(); + + assert!( + result.is_empty(), + "encoded traversal must not escape node_modules; got {result:?}" + ); + } + /// Regression: a qualified PURL (carrying `?qualifiers`) must resolve and /// be keyed by the *verbatim* input PURL — not a reconstructed, stripped /// form. The dispatcher drives npm with `passthrough_purls` + diff --git a/crates/socket-patch-core/src/patch/apply.rs b/crates/socket-patch-core/src/patch/apply.rs index e880707..fe5a9d5 100644 --- a/crates/socket-patch-core/src/patch/apply.rs +++ b/crates/socket-patch-core/src/patch/apply.rs @@ -11,7 +11,7 @@ use crate::patch::file_hash::compute_file_git_sha256; use crate::patch::package::read_archive_filtered; /// Status of a file patch verification. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum VerifyStatus { /// File is ready to be patched (current hash matches beforeHash). Ready, @@ -34,6 +34,33 @@ pub struct VerifyResult { pub target_hash: Option, } +/// How the apply pipeline treats a file whose on-disk content matches +/// NEITHER `beforeHash` nor `afterHash` (and a pre-existing file that is +/// missing). 
+/// +/// Mismatch tolerance is safe content-wise in every mode: the diff +/// strategy self-disables on a wrong base, and the archive/blob +/// strategies verify their bytes hash to exactly `afterHash` BEFORE any +/// write — a tolerated mismatch is overwritten with the verified patched +/// content or fails, never silently corrupted. What tolerance can do is +/// discard local modifications to the dependency file, which is why +/// `Strict` exists. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum MismatchPolicy { + /// DEFAULT: a beforeHash mismatch is overwritten with the verified + /// patched content and surfaced as a warning (the promoted + /// [`VerifyResult`] keeps `expected_hash`/`current_hash`, which is + /// how callers detect and report it). A MISSING pre-existing file is + /// still a hard error. + #[default] + Warn, + /// A beforeHash mismatch is a hard error (`--strict`). + Strict, + /// [`MismatchPolicy::Warn`] PLUS missing pre-existing files are + /// skipped instead of failing (`--force`). + Force, +} + /// Which patch source actually wrote the patched bytes for a file. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum AppliedVia { @@ -682,7 +709,7 @@ pub async fn apply_package_patch( sources: &PatchSources<'_>, uuid: Option<&str>, dry_run: bool, - force: bool, + policy: MismatchPolicy, ) -> ApplyResult { let mut result = ApplyResult { package_key: package_key.to_string(), @@ -714,30 +741,32 @@ pub async fn apply_package_patch( if verify_result.status != VerifyStatus::Ready && verify_result.status != VerifyStatus::AlreadyPatched { - if force { - match verify_result.status { - VerifyStatus::HashMismatch => { - // Force: treat hash mismatch as ready - verify_result.status = VerifyStatus::Ready; - } - VerifyStatus::NotFound => { - // Force: skip files that don't exist (non-new files) - result.files_verified.push(verify_result); - continue; - } - _ => {} + match (verify_result.status, policy) { + // Mismatch tolerated (default + force): promote to Ready. + // The promoted result KEEPS `expected_hash`/`current_hash` + // — the signature callers use to surface the warning. The + // diff strategy self-disables on the wrong base; the + // archive/blob strategies are hash-gated to afterHash. + (VerifyStatus::HashMismatch, MismatchPolicy::Warn | MismatchPolicy::Force) => { + verify_result.status = VerifyStatus::Ready; + } + // Force only: skip missing pre-existing files. 
+ (VerifyStatus::NotFound, MismatchPolicy::Force) => { + result.files_verified.push(verify_result); + continue; + } + _ => { + let msg = verify_result + .message + .clone() + .unwrap_or_else(|| format!("{:?}", verify_result.status)); + result.error = Some(format!( + "Cannot apply patch: {} - {}", + verify_result.file, msg + )); + result.files_verified.push(verify_result); + return result; } - } else { - let msg = verify_result - .message - .clone() - .unwrap_or_else(|| format!("{:?}", verify_result.status)); - result.error = Some(format!( - "Cannot apply patch: {} - {}", - verify_result.file, msg - )); - result.files_verified.push(verify_result); - return result; } } @@ -1654,7 +1683,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1706,7 +1735,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1743,7 +1772,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, true, - false, + MismatchPolicy::Warn, ) .await; @@ -1785,7 +1814,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1818,7 +1847,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1826,24 +1855,23 @@ mod tests { assert!(result.error.is_some()); } + /// beforeHash mismatch across the three policies: the DEFAULT (Warn) + /// overwrites with the verified patched content and keeps the + /// promoted warning signature (`Ready` + `expected_hash: Some` + + /// differing `current_hash`); `Strict` is the old hard error; `Force` + /// behaves like Warn (its extra tolerance is missing files). 
#[tokio::test] - async fn test_apply_package_patch_force_hash_mismatch() { + async fn test_apply_package_patch_hash_mismatch_policies() { let pkg_dir = tempfile::tempdir().unwrap(); let blobs_dir = tempfile::tempdir().unwrap(); let patched = b"patched content"; let after_hash = compute_git_sha256_from_bytes(patched); + let divergent = b"something unexpected"; - // Write a file whose hash does NOT match before_hash - tokio::fs::write(pkg_dir.path().join("index.js"), b"something unexpected") - .await - .unwrap(); - - // Write blob tokio::fs::write(blobs_dir.path().join(&after_hash), patched) .await .unwrap(); - let mut files = HashMap::new(); files.insert( "index.js".to_string(), @@ -1853,25 +1881,41 @@ mod tests { }, ); - // Without force: should fail - let result = apply_package_patch( - "pkg:npm/test@1.0.0", - pkg_dir.path(), - &files, - &PatchSources::blobs_only(blobs_dir.path()), - None, - false, - false, - ) - .await; - assert!(!result.success); + for policy in [MismatchPolicy::Warn, MismatchPolicy::Force] { + tokio::fs::write(pkg_dir.path().join("index.js"), divergent) + .await + .unwrap(); + let result = apply_package_patch( + "pkg:npm/test@1.0.0", + pkg_dir.path(), + &files, + &PatchSources::blobs_only(blobs_dir.path()), + None, + false, + policy, + ) + .await; + assert!(result.success, "{policy:?}: {:?}", result.error); + assert_eq!(result.files_patched.len(), 1, "{policy:?}"); + // The promoted verify keeps the mismatch signature for the + // caller's warning report. + let v = &result.files_verified[0]; + assert_eq!(v.status, VerifyStatus::Ready, "{policy:?}"); + assert!( + v.expected_hash.is_some() && v.current_hash != v.expected_hash, + "{policy:?}: promoted signature retained" + ); + // The bytes on disk are EXACTLY the verified patched content. 
+ let written = tokio::fs::read(pkg_dir.path().join("index.js")) + .await + .unwrap(); + assert_eq!(written, patched, "{policy:?}"); + } - // Reset the file - tokio::fs::write(pkg_dir.path().join("index.js"), b"something unexpected") + // Strict: the old fail-closed behavior, file untouched. + tokio::fs::write(pkg_dir.path().join("index.js"), divergent) .await .unwrap(); - - // With force: should succeed let result = apply_package_patch( "pkg:npm/test@1.0.0", pkg_dir.path(), @@ -1879,16 +1923,38 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - true, + MismatchPolicy::Strict, ) .await; - assert!(result.success); - assert_eq!(result.files_patched.len(), 1); + assert!(!result.success); + assert!(result + .error + .as_deref() + .unwrap_or("") + .contains("does not match")); + let untouched = tokio::fs::read(pkg_dir.path().join("index.js")) + .await + .unwrap(); + assert_eq!(untouched, divergent, "strict never writes"); - let written = tokio::fs::read(pkg_dir.path().join("index.js")) + // A missing pre-existing file is STILL an error by default and + // under strict — only Force skips it. 
+ tokio::fs::remove_file(pkg_dir.path().join("index.js")) .await .unwrap(); - assert_eq!(written, patched); + for policy in [MismatchPolicy::Warn, MismatchPolicy::Strict] { + let result = apply_package_patch( + "pkg:npm/test@1.0.0", + pkg_dir.path(), + &files, + &PatchSources::blobs_only(blobs_dir.path()), + None, + false, + policy, + ) + .await; + assert!(!result.success, "{policy:?}: missing file fails closed"); + } } #[tokio::test] @@ -1913,7 +1979,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(!result.success); @@ -1926,7 +1992,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - true, + MismatchPolicy::Force, ) .await; assert!(result.success); @@ -2054,7 +2120,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2089,7 +2155,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2123,7 +2189,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2152,7 +2218,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -2189,7 +2255,7 @@ mod tests { &sources, Some(TEST_UUID), false, - true, // --force + MismatchPolicy::Force, ) .await; @@ -2229,7 +2295,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2259,7 +2325,7 @@ mod tests { &sources, Some(TEST_UUID), true, // dry-run - false, + MismatchPolicy::Warn, ) .await; @@ -2553,7 +2619,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; diff --git a/crates/socket-patch-core/src/patch/go_redirect.rs b/crates/socket-patch-core/src/patch/go_redirect.rs index 34e5778..f54cd60 100644 --- a/crates/socket-patch-core/src/patch/go_redirect.rs +++ b/crates/socket-patch-core/src/patch/go_redirect.rs @@ -27,7 +27,8 @@ use std::path::{Path, PathBuf}; use 
crate::manifest::schema::{PatchFileInfo, PatchManifest}; use crate::patch::apply::{ - apply_package_patch, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, + apply_package_patch, normalize_file_path, ApplyResult, MismatchPolicy, PatchSources, + VerifyResult, VerifyStatus, }; use crate::patch::file_hash::compute_file_git_sha256; use crate::utils::purl::{build_golang_purl, parse_golang_purl, strip_purl_qualifiers}; @@ -164,7 +165,7 @@ pub async fn apply_go_redirect( sources: &PatchSources<'_>, uuid: Option<&str>, dry_run: bool, - force: bool, + policy: MismatchPolicy, ) -> ApplyResult { // SECURITY: refuse coordinates that would escape the copy base. // A `..`/separator-laden `module`/`version` (a tampered manifest PURL) would @@ -195,7 +196,7 @@ pub async fn apply_go_redirect( // Verify (read-only) against the pristine source for an accurate // "would patch" report, without creating the copy or editing go.mod. let mut result = - apply_package_patch(purl, pristine_src, files, sources, uuid, true, force).await; + apply_package_patch(purl, pristine_src, files, sources, uuid, true, policy).await; result.package_path = copy_dir.display().to_string(); result.sidecar = None; // a replace copy is not the cache (no go.sum advisory) return result; @@ -235,7 +236,7 @@ pub async fn apply_go_redirect( } // Delegate to the hardened pipeline, pointed at the copy. 
- let mut result = apply_package_patch(purl, &copy_dir, files, sources, uuid, false, force).await; + let mut result = apply_package_patch(purl, &copy_dir, files, sources, uuid, false, policy).await; result.package_path = copy_dir.display().to_string(); // The golang sidecar advisory ("go mod verify will fail against go.sum") // is about in-cache patching; a `replace` copy bypasses go.sum entirely, so @@ -761,7 +762,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -810,7 +811,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -830,7 +831,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -866,7 +867,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -884,7 +885,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -910,7 +911,7 @@ mod tests { &sources, None, true, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -945,7 +946,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(!result.success); @@ -980,7 +981,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1016,7 +1017,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1056,7 +1057,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1090,7 +1091,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1119,7 +1120,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; // Add a user-authored replace.
@@ -1157,7 +1158,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1209,7 +1210,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; // Drop the directive but keep the copy. @@ -1243,7 +1244,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1287,7 +1288,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1328,7 +1329,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1365,7 +1366,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -1446,7 +1447,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1495,7 +1496,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1536,7 +1537,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(!result.success); diff --git a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs index 8199bc6..f35a2c2 100644 --- a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs @@ -82,7 +82,7 @@ pub async fn vendor_bun( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); // BN3 spelling: BARE project-relative path, no `file:`/`./` prefix. let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); @@ -136,6 +136,7 @@ pub async fn vendor_bun( sources, dry_run, force, + &mut warnings, ) .await { @@ -422,7 +423,7 @@ fn revert_one_record( // ───────────────────────── conservative line grammar ────────────────────── /// One parsed single-line packages entry. 
-struct BunEntry { +pub(super) struct BunEntry { line_idx: usize, /// Leading whitespace, re-emitted verbatim. indent: String, @@ -431,7 +432,7 @@ struct BunEntry { /// The key token exactly as spelled (incl. quotes), re-emitted verbatim. key_raw: String, /// Verbatim top-level tuple elements (trimmed). - elems: Vec, + pub(super) elems: Vec, trailing_comma: bool, } @@ -471,14 +472,14 @@ fn classify(entry: &BunEntry, target_spec: &str, name: &str) -> Option Option<(&str, &str)> { +pub(super) fn split_name_spec(s: &str) -> Option<(&str, &str)> { let at = s.rfind('@').filter(|&i| i > 0)?; Some((&s[..at], &s[at + 1..])) } /// `"lockfileVersion": ` head check — only the fixture-pinned text /// lockfile version is spliced (fail-closed on anything newer/older). -fn check_lock_version(text: &str) -> Result<(), String> { +pub(super) fn check_lock_version(text: &str) -> Result<(), String> { let version = text.lines().take(5).find_map(|line| { line.trim() .strip_prefix("\"lockfileVersion\":") @@ -513,7 +514,7 @@ fn packages_bounds(lines: &[String]) -> Option<(usize, usize)> { /// Strictly parse every entry line of the packages section. Any line that /// is neither blank nor a single-line `"key": [tuple]` entry fails CLOSED. -fn parse_packages_section(lines: &[String]) -> Result, String> { +pub(super) fn parse_packages_section(lines: &[String]) -> Result, String> { let Some((start, end)) = packages_bounds(lines) else { // No (or unterminated) packages section: an empty lock simply has // no entries; an unterminated one is malformed. @@ -650,7 +651,7 @@ fn split_top_level(interior: &str) -> Result, String> { } /// Decode a verbatim JSON string token; `None` if it is not one. 
-fn decode_json_string(token: &str) -> Option { +pub(super) fn decode_json_string(token: &str) -> Option { if !token.starts_with('"') { return None; } diff --git a/crates/socket-patch-core/src/patch/vendor/cargo.rs b/crates/socket-patch-core/src/patch/vendor/cargo.rs index 85ea5cb..614fd28 100644 --- a/crates/socket-patch-core/src/patch/vendor/cargo.rs +++ b/crates/socket-patch-core/src/patch/vendor/cargo.rs @@ -18,7 +18,7 @@ use std::path::{Path, PathBuf}; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, + normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::file_hash::compute_file_git_sha256; @@ -269,22 +269,27 @@ pub async fn vendor_cargo_crate( } if dry_run { - // Verify (read-only) against the pristine source — apply_package_patch - // never writes when dry_run — for an accurate "would patch" report, - // without creating the copy or editing config/lock. - let mut result = apply_package_patch( + // Verify (read-only) against the pristine source — the apply + // pipeline never writes when dry_run — for an accurate "would + // patch" report (including the auto-force overwrite warnings the + // real run would emit), without creating the copy or editing + // config/lock. 
+ let mut dry_warnings: Vec<VendorWarning> = Vec::new(); + let mut result = super::force_apply_staged( purl, pristine_src, - &record.files, + record, sources, - Some(&record.uuid), true, force, + name, + version, + &mut dry_warnings, ) .await; result.package_path = copy_dir.display().to_string(); result.sidecar = None; - return done(result, None, Vec::new()); + return done(result, None, dry_warnings); } // Hot path: already in sync → touch nothing (entry stays with the caller's @@ -333,15 +338,19 @@ pub async fn vendor_cargo_crate( ); } - // Delegate to the hardened pipeline, pointed at the copy. - let mut result = apply_package_patch( + // Delegate to the hardened pipeline (vendor auto-force policy — see + // `force_apply_staged`), pointed at the copy. + let mut warnings: Vec<VendorWarning> = Vec::new(); + let mut result = super::force_apply_staged( purl, &copy_dir, - &record.files, + record, sources, - Some(&record.uuid), false, force, + name, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -350,7 +359,7 @@ pub async fn vendor_cargo_crate( // Don't leave a half-built copy (or an empty uuid husk) that // verify/sweep would misjudge. let _ = remove_tree(&uuid_dir).await; - return done(result, None, Vec::new()); + return done(result, None, warnings); } // A path-dep copy must never carry a checksum sidecar.
The fresh copy @@ -370,10 +379,9 @@ pub async fn vendor_cargo_crate( let _ = remove_tree(&uuid_dir).await; result.success = false; result.error = Some(format!("failed to update .cargo/config.toml: {e}")); - return done(result, None, Vec::new()); + return done(result, None, warnings); } - let mut warnings = Vec::new(); let prior_path = prior_entry.as_ref().and_then(|i| i.path.clone()); if prior_path.as_deref().is_some_and(is_legacy_redirect_path) { warnings.push(VendorWarning::new( diff --git a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs index b2ae9cd..5a533eb 100644 --- a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs @@ -35,7 +35,7 @@ use serde_json::{json, Map, Value}; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; @@ -193,21 +193,24 @@ pub async fn vendor_composer( // ── dry run: verify-only against the installed dir, no writes ──────── if dry_run { - let mut result = apply_package_patch( + let mut dry_warnings: Vec<VendorWarning> = Vec::new(); + let mut result = super::force_apply_staged( purl, installed_dir, - &record.files, + record, sources, - Some(&record.uuid), true, force, + &pkg, + version, + &mut dry_warnings, ) .await; result.package_path = copy_dir.display().to_string(); return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings: dry_warnings, }; } @@ -225,14 +228,17 @@ pub async fn vendor_composer( warnings: Vec::new(), }; } - let mut result = apply_package_patch( + let mut warnings: Vec<VendorWarning> = Vec::new(); + let mut result = super::force_apply_staged( purl, &copy_dir, - &record.files, + record, sources, - 
Some(&record.uuid), false, force, + &pkg, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -242,7 +248,7 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -256,7 +262,7 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; }; let rewritten = rewrite_lock_entry(original_obj, &copy_rel, &record.uuid); @@ -272,12 +278,11 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } // ── marker + ledger entry ──────────────────────────────────────────── - let mut warnings = Vec::new(); let base_purl = build_composer_purl(&vendor, &name, version); let mut vulnerabilities: Vec = record.vulnerabilities.keys().cloned().collect(); vulnerabilities.sort(); diff --git a/crates/socket-patch-core/src/patch/vendor/gem.rs b/crates/socket-patch-core/src/patch/vendor/gem.rs index 5ce51a2..0eccace 100644 --- a/crates/socket-patch-core/src/patch/vendor/gem.rs +++ b/crates/socket-patch-core/src/patch/vendor/gem.rs @@ -53,7 +53,7 @@ use serde_json::Value; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; @@ -282,21 +282,24 @@ pub async fn vendor_gem( // ── dry run: verify-only against the installed dir, no writes ──────── if dry_run { - let mut result = apply_package_patch( + let mut dry_warnings: Vec<VendorWarning> = Vec::new(); + let mut result = super::force_apply_staged( purl, installed_dir, - &record.files, + record, sources, - Some(&record.uuid), true, force, + name, + version, + &mut dry_warnings, ) .await; result.package_path = 
copy_dir.display().to_string(); return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings: dry_warnings, }; } @@ -338,14 +341,17 @@ pub async fn vendor_gem( warnings: Vec::new(), }; } - let mut result = apply_package_patch( + let mut warnings: Vec<VendorWarning> = Vec::new(); + let mut result = super::force_apply_staged( purl, &copy_dir, - &record.files, + record, sources, - Some(&record.uuid), false, force, + name, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -355,7 +361,7 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -368,7 +374,7 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -395,13 +401,12 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } }; // ── marker + ledger entry ──────────────────────────────────────────── - let mut warnings = Vec::new(); let base_purl = build_gem_purl(name, version); let mut vulnerabilities: Vec = record.vulnerabilities.keys().cloned().collect(); vulnerabilities.sort(); diff --git a/crates/socket-patch-core/src/patch/vendor/golang.rs b/crates/socket-patch-core/src/patch/vendor/golang.rs index 8961482..d532485 100644 --- a/crates/socket-patch-core/src/patch/vendor/golang.rs +++ b/crates/socket-patch-core/src/patch/vendor/golang.rs @@ -101,6 +101,29 @@ pub async fn vendor_go_module( .is_some_and(|e| e.owner == Some(ReplaceOwner::GoPatches)); let prior_path = prior.as_ref().and_then(|e| e.path.clone()); + // Vendor auto-force policy (the engine's copy is staged from the + // pristine source, never the user's tree — see `force_apply_staged`): + // missing patch targets still fail closed unless the caller's own + // `--force` asked for the skip tolerance, then the engine apply runs + // forced so a beforeHash mismatch (already-applied module, or a patch + // built 
against different bytes) overwrites with the verified patched + // content. The engine is shared with the in-place `apply` redirect + // path, whose strict semantics stay unchanged. + let mut warnings: Vec = Vec::new(); + if !force { + let missing = super::missing_existing_patch_files(pristine_src, &record.files).await; + if let Some(first) = missing.first() { + return VendorOutcome::Done { + result: super::failed_apply_result( + purl, + format!("Cannot apply patch: {first} - File not found"), + ), + entry: None, + warnings, + }; + } + } + // The engine does the heavy lifting: fresh copy → hardened apply pipeline // → `replace` upsert (which refuses a user-authored same-version pin). let result = apply_go_redirect( @@ -114,15 +137,18 @@ pub async fn vendor_go_module( sources, Some(&record.uuid), dry_run, - force, + crate::patch::apply::MismatchPolicy::Force, ) .await; + if result.success { + warnings.extend(super::mismatch_overwrite_warnings(&result, module, version)); + } if dry_run { return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } if !result.success { @@ -134,7 +160,7 @@ pub async fn vendor_go_module( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } // A patch with no files is a no-op success: the engine wrote no copy and @@ -143,12 +169,10 @@ pub async fn vendor_go_module( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } - let mut warnings = Vec::new(); - if takeover { // The `replace` line was already atomically repointed by the upsert; // the apply backend's copy is now unreachable — delete it (built from @@ -311,6 +335,7 @@ pub async fn revert_go_vendor( #[cfg(test)] mod tests { use super::*; + use crate::patch::apply::MismatchPolicy; use crate::hash::git_sha256::compute_git_sha256_from_bytes; use crate::manifest::schema::{PatchFileInfo, VulnerabilityInfo}; use crate::patch::apply::ApplyResult; @@ -532,7 +557,7 @@ mod tests { 
&sources, Some(UUID), false, - false, + MismatchPolicy::Warn, ) .await; assert!(pre.success, "fixture redirect failed: {:?}", pre.error); @@ -696,7 +721,7 @@ mod tests { &sources, Some(UUID), false, - false, + MismatchPolicy::Warn, ) .await; let (_result, entry, _warnings) = diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs new file mode 100644 index 0000000..476353c --- /dev/null +++ b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -0,0 +1,1576 @@ +//! Read-only lockfile inventories: the dependency set a project's lockfile +//! resolves, independent of what is installed on disk. +//! +//! Two consumers: +//! +//! * `scan` supplements its installed-tree crawl with lockfile-only entries +//! (discovery on fresh clones and partial installs), warning that those +//! packages are not yet installed; +//! * `vendor` fetches the pristine artifact for a lockfile-resolved package +//! with no installed copy ([`super::registry_fetch`]), verifying the bytes +//! against the integrity the lock records — FAIL-CLOSED: an entry whose +//! lock carries no content verifier is never fetched. +//! +//! Parsing is fail-soft per entry (a malformed entry is skipped, never an +//! error; a malformed file yields `None`) and fail-closed per value: +//! names/versions are path-safety-guarded before an entry is emitted — the +//! lockfile is committed, tamperable input that later feeds filesystem paths +//! and download URLs. + +use std::collections::HashMap; +use std::path::Path; + +use serde_json::Value; + +use crate::patch::path_safety; +use crate::utils::purl::strip_purl_qualifiers; + +use super::npm_common::is_safe_npm_name; +use super::npm_flavor::{detect_npm_lock_flavor, NpmLockFlavor}; +use super::path::parse_vendor_path; +use super::{bun_lock, pnpm_lock, yarn_berry_lock, yarn_classic_lock}; + +/// The content verifier a lockfile records for an entry. 
The fetch layer +/// refuses entries whose verifier is [`LockIntegrity::None`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LockIntegrity { + /// SRI string (`sha512-`, possibly multi-hash space-separated) — + /// npm family; verified against the raw tarball bytes. + Sri(String), + /// yarn classic `resolved "...#"` fragment (40-hex) — verified + /// against the raw tarball bytes. + Sha1Hex(String), + /// yarn berry cache-zip checksum (`/`, e.g. `10c0/…`) — + /// verified by rebuilding the deterministic cache zip from the fetched + /// tarball and comparing (the lock never hashes the tarball itself). + BerryChecksum(String), + /// Hex sha256 of the artifact (Cargo.lock `checksum`, pypi file hashes, + /// Gemfile.lock `CHECKSUMS`). + Sha256Hex(String), + /// go.sum module-zip dirhash (`h1:`). + GoH1(String), + /// The lock records no content verifier. + None, +} + +/// One lockfile-resolved package. +#[derive(Debug, Clone)] +pub struct LockfileEntry { + /// Vendor-ecosystem tag (`npm`, `cargo`, `golang`, `pypi`, `gem`, + /// `composer`) — matches `VendorEntry::ecosystem`. + pub ecosystem: &'static str, + /// Literal (percent-decoded) package name, e.g. `@scope/name`. + pub name: String, + /// Exact resolved version. + pub version: String, + /// Canonical literal purl (`pkg:npm/@scope/name@1.0.0`) — the same form + /// the crawlers emit. + pub purl: String, + /// Artifact URL when the lock records one (package-lock `resolved`, + /// yarn `resolved` minus its `#sha1` fragment, pnpm `tarball:`); `None` + /// means the fetcher constructs the conventional registry URL. 
+ pub resolved: Option, + pub integrity: LockIntegrity, +} + +impl LockfileEntry { + fn npm( + name: impl Into, + version: impl Into, + resolved: Option, + integrity: LockIntegrity, + ) -> Self { + let (name, version) = (name.into(), version.into()); + let purl = format!("pkg:npm/{name}@{version}"); + LockfileEntry { + ecosystem: "npm", + name, + version, + purl, + resolved, + integrity, + } + } +} + +/// Inventory the project's npm-family lockfile. Routes by +/// [`detect_npm_lock_flavor`] (PnP markers, bun.lockb, unsupported lock +/// versions, and a missing lockfile all yield `None`). +pub async fn inventory_npm_lock( + project_root: &Path, +) -> Option<(NpmLockFlavor, Vec)> { + let (flavor, _warnings) = detect_npm_lock_flavor(project_root).await.ok()?; + let raw = match flavor { + NpmLockFlavor::PackageLock => inventory_package_lock(project_root).await, + NpmLockFlavor::Pnpm => inventory_pnpm_lock(project_root).await, + NpmLockFlavor::YarnClassic => inventory_yarn_classic(project_root).await, + NpmLockFlavor::YarnBerry => inventory_yarn_berry(project_root).await, + NpmLockFlavor::Bun => inventory_bun(project_root).await, + }?; + Some((flavor, finalize_npm(raw))) +} + +/// Match a manifest/API purl (possibly percent-encoded, possibly carrying +/// qualifiers) against the inventory: components decode via +/// [`crate::utils::purl::normalize_purl`], so `pkg:npm/%40scope/x@1` +/// matches the literal entry. +pub fn lookup<'a>(entries: &'a [LockfileEntry], purl: &str) -> Option<&'a LockfileEntry> { + let decoded = crate::utils::purl::normalize_purl(strip_purl_qualifiers(purl)).into_owned(); + let rest = decoded.strip_prefix("pkg:")?; + let (purl_type, rest) = rest.split_once('/')?; + // purl type → vendor-ecosystem tag (same mapping the dispatcher uses). 
+ let eco = match purl_type { + "npm" => "npm", + "cargo" => "cargo", + "golang" => "golang", + "pypi" => "pypi", + "gem" => "gem", + "composer" => "composer", + _ => return None, + }; + let at = rest.rfind('@').filter(|&i| i > 0)?; + let (name, version) = (&rest[..at], &rest[at + 1..]); + // pypi names compare in PEP 503 normalized form. + let name = if eco == "pypi" { + pep503(name) + } else { + name.to_string() + }; + entries + .iter() + .find(|e| e.ecosystem == eco && e.name == name && e.version == version) +} + +/// Everything every recognized lockfile in the project resolves — the +/// union the scan supplement and the vendor auto-fetch consume. +pub async fn inventory_project(project_root: &Path) -> Vec { + let mut out: Vec = Vec::new(); + if let Some((_, entries)) = inventory_npm_lock(project_root).await { + out.extend(entries); + } + #[cfg(feature = "cargo")] + if let Some(entries) = inventory_cargo_lock(project_root).await { + out.extend(entries); + } + #[cfg(feature = "golang")] + if let Some(entries) = inventory_go_sum(project_root).await { + out.extend(entries); + } + #[cfg(feature = "composer")] + if let Some(entries) = inventory_composer_lock(project_root).await { + out.extend(entries); + } + if let Some(entries) = inventory_gemfile_lock(project_root).await { + out.extend(entries); + } + if let Some(entries) = inventory_pypi_locks(project_root).await { + out.extend(entries); + } + out +} + +/// Guard + dedup the raw npm entries: unsafe names/versions are dropped +/// fail-closed; duplicate (name, version) instances collapse to one, +/// preferring the instance that carries a verifier. +fn finalize_npm(raw: Vec) -> Vec { + dedup_prefer_integrity( + raw.into_iter() + .filter(|e| { + is_safe_npm_name(&e.name) && path_safety::is_safe_single_segment(&e.version) + }) + .collect(), + ) +} + +/// Collapse duplicate (name, version) instances, preferring one that +/// carries a verifier. 
+fn dedup_prefer_integrity(raw: Vec) -> Vec { + let mut seen: HashMap<(String, String), usize> = HashMap::new(); + let mut out: Vec = Vec::new(); + for entry in raw { + let key = (entry.name.clone(), entry.version.clone()); + match seen.get(&key) { + Some(&i) => { + if out[i].integrity == LockIntegrity::None + && entry.integrity != LockIntegrity::None + { + out[i] = entry; + } + } + None => { + seen.insert(key, out.len()); + out.push(entry); + } + } + } + out +} + +// ──────────────────────────────── Cargo.lock ──────────────────────────────── + +/// Inventory `Cargo.lock` `[[package]]` blocks. Only crates.io-sourced +/// entries are fetchable (their `checksum` is the sha256 of the `.crate` +/// file); workspace members (no `source`) are skipped, and git/custom- +/// registry sources stay listed for discovery without a verifier. +#[cfg(feature = "cargo")] +pub async fn inventory_cargo_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("Cargo.lock")) + .await + .ok()?; + /// One in-flight `[[package]]` block: name, version, source, checksum. 
+ type CargoBlock = (Option, Option, Option, Option); + let mut out = Vec::new(); + let mut cur: Option = None; + let flush = |cur: &mut Option, out: &mut Vec| { + if let Some((Some(name), Some(version), source, checksum)) = cur.take() { + let Some(source) = source else { + return; // workspace member + }; + if !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + { + return; + } + let crates_io = source.contains("github.com/rust-lang/crates.io-index") + || source.contains("index.crates.io"); + let integrity = match checksum { + Some(c) if crates_io && c.len() == 64 && c.bytes().all(|b| b.is_ascii_hexdigit()) => { + LockIntegrity::Sha256Hex(c) + } + _ => LockIntegrity::None, + }; + let purl = format!("pkg:cargo/{name}@{version}"); + out.push(LockfileEntry { + ecosystem: "cargo", + name, + version, + purl, + resolved: None, + integrity, + }); + } + }; + for line in text.lines() { + let line = line.trim(); + if line == "[[package]]" { + flush(&mut cur, &mut out); + cur = Some((None, None, None, None)); + continue; + } + if line.starts_with('[') { + flush(&mut cur, &mut out); + continue; + } + let Some(slot) = cur.as_mut() else { continue }; + let Some((key, value)) = line.split_once('=') else { + continue; + }; + let value = value.trim().trim_matches('"').to_string(); + match key.trim() { + "name" => slot.0 = Some(value), + "version" => slot.1 = Some(value), + "source" => slot.2 = Some(value), + "checksum" => slot.3 = Some(value), + _ => {} + } + } + flush(&mut cur, &mut out); + Some(dedup_prefer_integrity(out)) +} + +// ────────────────────────────────── go.sum ────────────────────────────────── + +/// Inventory `go.sum` module-zip lines (` h1:`); the +/// `/go.mod`-suffixed lines hash only the manifest and are skipped. go.sum +/// may list more modules than the final build graph — acceptable for +/// discovery, and the manifest decides what actually gets vendored. 
+#[cfg(feature = "golang")] +pub async fn inventory_go_sum(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("go.sum")) + .await + .ok()?; + let mut out = Vec::new(); + for line in text.lines() { + let mut parts = line.split_whitespace(); + let (Some(module), Some(version), Some(hash)) = + (parts.next(), parts.next(), parts.next()) + else { + continue; + }; + if version.ends_with("/go.mod") || !hash.starts_with("h1:") { + continue; + } + // SECURITY: module path segments and the version feed paths/URLs. + if !path_safety::is_safe_multi_segment(module) + || !path_safety::is_safe_single_segment(version) + { + continue; + } + out.push(LockfileEntry { + ecosystem: "golang", + name: module.to_string(), + version: version.to_string(), + purl: format!("pkg:golang/{module}@{version}"), + resolved: None, + integrity: LockIntegrity::GoH1(hash.to_string()), + }); + } + Some(dedup_prefer_integrity(out)) +} + +/// Keep a lock-recorded URL only when it is a plain http(s) artifact URL +/// (drops `git+…`, `file:…`, `link:…` — content the registry conventions +/// cannot reproduce; such entries stay listed for discovery but the fetch +/// layer's integrity rule decides fetchability). +fn http_url(raw: &str) -> Option { + (raw.starts_with("https://") || raw.starts_with("http://")).then(|| raw.to_string()) +} + +// ──────────────────── package-lock.json / npm-shrinkwrap ──────────────────── + +async fn inventory_package_lock(root: &Path) -> Option> { + // Shrinkwrap wins, mirroring `npm_lock::select_lockfile`. + let mut bytes = None; + for lock in ["npm-shrinkwrap.json", "package-lock.json"] { + if let Ok(b) = tokio::fs::read(root.join(lock)).await { + bytes = Some(b); + break; + } + } + let doc: Value = serde_json::from_slice(&bytes?).ok()?; + // v1 legacy locks have no `packages` map — no inventory (documented). 
+ let packages = doc.get("packages")?.as_object()?; + + let mut out = Vec::new(); + for (key, node) in packages { + // "" is the root project; keys without node_modules/ are workspace + // members (mirrors npm_lock::scan_lock_matches' member rule). + let Some((_, key_name)) = key.rsplit_once("node_modules/") else { + continue; + }; + if node.get("link").and_then(Value::as_bool).unwrap_or(false) + || node.get("inBundle").and_then(Value::as_bool).unwrap_or(false) + { + continue; + } + let name = node + .get("name") + .and_then(Value::as_str) + .unwrap_or(key_name) + .to_string(); + let Some(version) = node.get("version").and_then(Value::as_str) else { + continue; + }; + let resolved_raw = node.get("resolved").and_then(Value::as_str); + // Our own vendored spec: not a registry dependency. + if resolved_raw.is_some_and(|r| parse_vendor_path(r).is_some()) { + continue; + } + let integrity = node + .get("integrity") + .and_then(Value::as_str) + .map(|i| LockIntegrity::Sri(i.to_string())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm( + name, + version, + resolved_raw.and_then(http_url), + integrity, + )); + } + Some(out) +} + +// ─────────────────────────── pnpm-lock.yaml v9 ─────────────────────────── + +/// Extract one value from an inline YAML map fragment like +/// `{integrity: sha512-…, tarball: file:…}` (values optionally quoted). 
+fn inline_map_value(fragment: &str, field: &str) -> Option { + let at = fragment.find(&format!("{field}:"))?; + let rest = fragment[at + field.len() + 1..].trim_start(); + let end = rest.find([',', '}']).unwrap_or(rest.len()); + let value = rest[..end].trim().trim_matches(['\'', '"']); + (!value.is_empty()).then(|| value.to_string()) +} + +async fn inventory_pnpm_lock(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("pnpm-lock.yaml")) + .await + .ok()?; + let lines = pnpm_lock::split_lines(&text); + let (start, end) = pnpm_lock::section_bounds(&lines, "packages")?; + + let mut out = Vec::new(); + let mut i = start + 1; + while let Some(block) = pnpm_lock::next_block(&lines, i, end) { + i = block.end; + // Key grammar: `name@version` (name may be `@scope/name`), with + // optional peer-dep suffixes `(peer@1.2.3)…` after the version. + let base = match block.key.find('(') { + Some(p) => block.key[..p].trim_end(), + None => block.key.as_str(), + }; + let Some(at) = base.rfind('@').filter(|&p| p > 0) else { + continue; + }; + let (name, version) = (&base[..at], &base[at + 1..]); + // Only plain registry versions: `file:`/`link:`/`https:`/git specs + // are not registry-resolvable. + if !version.chars().next().is_some_and(|c| c.is_ascii_digit()) { + continue; + } + let mut integrity = LockIntegrity::None; + let mut tarball: Option = None; + for line in &lines[block.header + 1..block.end] { + let t = line.trim(); + if let Some(rest) = t.strip_prefix("resolution:") { + if let Some(v) = inline_map_value(rest, "integrity") { + integrity = LockIntegrity::Sri(v); + } + tarball = inline_map_value(rest, "tarball"); + break; + } + } + // Our own vendored spec: not a registry dependency. 
+ if tarball.as_deref().is_some_and(|t| parse_vendor_path(t).is_some()) { + continue; + } + out.push(LockfileEntry::npm( + name, + version, + tarball.as_deref().and_then(http_url), + integrity, + )); + } + Some(out) +} + +// ───────────────────────────── yarn.lock (classic) ───────────────────────────── + +async fn inventory_yarn_classic(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("yarn.lock")).await.ok()?; + let mut out = Vec::new(); + for block in yarn_classic_lock::scan_blocks(&text) { + // Our own vendored block: not a registry dependency. + if yarn_classic_lock::block_points_into_vendor(&block.lines) { + continue; + } + let patterns = yarn_classic_lock::split_key_patterns(&block.key); + let Some(name) = patterns + .first() + .and_then(|p| yarn_classic_lock::pattern_real_name(p)) + else { + continue; + }; + let Some(version) = yarn_classic_lock::classic_field(&block.lines, "version") else { + continue; + }; + let resolved_raw = yarn_classic_lock::classic_field(&block.lines, "resolved"); + // `resolved "url#sha1hex"` — the fragment is the legacy verifier. 
+ let (resolved, sha1_hex) = match resolved_raw { + Some(raw) => match raw.split_once('#') { + Some((url, frag)) => ( + http_url(url), + (frag.len() == 40 && frag.bytes().all(|b| b.is_ascii_hexdigit())) + .then(|| frag.to_ascii_lowercase()), + ), + None => (http_url(raw), None), + }, + None => (None, None), + }; + let integrity = yarn_classic_lock::classic_field(&block.lines, "integrity") + .map(|i| LockIntegrity::Sri(i.to_string())) + .or(sha1_hex.map(LockIntegrity::Sha1Hex)) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm(name, version, resolved, integrity)); + } + Some(out) +} + +// ───────────────────────────── yarn.lock (berry) ───────────────────────────── + +async fn inventory_yarn_berry(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("yarn.lock")).await.ok()?; + let mut out = Vec::new(); + // Berry reuses classic's block grammar (same scanner the berry backend + // imports); `__metadata` and workspace/patch/file resolutions are not + // registry packages. + for block in yarn_classic_lock::scan_blocks(&text) { + if block.key.starts_with("__metadata") { + continue; + } + let Some(resolution) = yarn_berry_lock::berry_field(&block.lines, "resolution") else { + continue; + }; + // Registry resolutions are `name@npm:` (a `::binding` + // suffix may follow). Anything else (workspace:/patch:/file:/link:) + // is skipped — including our own vendored file: resolutions. 
+ let Some((name, reference)) = yarn_classic_lock::split_pattern(resolution) else { + continue; + }; + let Some(reference) = reference.strip_prefix("npm:") else { + continue; + }; + let version_from_res = reference.split("::").next().unwrap_or(reference); + let version = yarn_berry_lock::berry_field(&block.lines, "version") + .unwrap_or(version_from_res); + let integrity = yarn_berry_lock::berry_field(&block.lines, "checksum") + .map(|c| LockIntegrity::BerryChecksum(c.to_string())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm(name, version, None, integrity)); + } + Some(out) +} + +// ──────────────────────────────── bun.lock ──────────────────────────────── + +async fn inventory_bun(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("bun.lock")).await.ok()?; + bun_lock::check_lock_version(&text).ok()?; + let lines: Vec = text.split('\n').map(str::to_string).collect(); + let entries = bun_lock::parse_packages_section(&lines).ok()?; + + let mut out = Vec::new(); + for entry in entries { + // Registry entries are 4-tuples `[spec, registry, {deps}, sha512]`; + // our vendored 3-tuples and other shapes are skipped. + if entry.elems.len() != 4 || !entry.elems[2].starts_with('{') { + continue; + } + let Some(spec) = entry.elems.first().and_then(|e| bun_lock::decode_json_string(e)) + else { + continue; + }; + let Some((name, version)) = bun_lock::split_name_spec(&spec) else { + continue; + }; + if !version.chars().next().is_some_and(|c| c.is_ascii_digit()) { + continue; + } + let Some(registry) = bun_lock::decode_json_string(&entry.elems[1]) else { + continue; + }; + let Some(integrity) = bun_lock::decode_json_string(&entry.elems[3]) else { + continue; + }; + // elem[1] is `""` for the default registry; a full `.tgz` URL is + // used verbatim; any other base falls back to conventional URL + // construction (the integrity check still gates the content). 
+ let resolved = (registry.ends_with(".tgz")) + .then(|| http_url(®istry)) + .flatten(); + out.push(LockfileEntry::npm( + name, + version, + resolved, + LockIntegrity::Sri(integrity), + )); + } + Some(out) +} + +// ────────────────────────────── composer.lock ────────────────────────────── + +/// Inventory `composer.lock` `packages`/`packages-dev`. The `dist.shasum` +/// (sha1 of the dist zip) is frequently empty — such entries stay +/// discovery-only. Names lowercase to the canonical packagist form; +/// versions drop the pretty leading `v`. +#[cfg(feature = "composer")] +pub async fn inventory_composer_lock(project_root: &Path) -> Option> { + let bytes = tokio::fs::read(project_root.join("composer.lock")).await.ok()?; + let doc: Value = serde_json::from_slice(&bytes).ok()?; + let mut out = Vec::new(); + for section in ["packages", "packages-dev"] { + let Some(list) = doc.get(section).and_then(Value::as_array) else { + continue; + }; + for pkg in list { + let Some(name) = pkg.get("name").and_then(Value::as_str) else { + continue; + }; + let Some(version) = pkg.get("version").and_then(Value::as_str) else { + continue; + }; + let name = name.to_ascii_lowercase(); + let version = version + .strip_prefix('v') + .filter(|r| r.chars().next().is_some_and(|c| c.is_ascii_digit())) + .unwrap_or(version) + .to_string(); + if !path_safety::is_safe_multi_segment(&name) + || name.split('/').count() != 2 + || !path_safety::is_safe_single_segment(&version) + { + continue; + } + let dist = pkg.get("dist"); + let dist_url = dist + .and_then(|d| d.get("url")) + .and_then(Value::as_str) + .unwrap_or(""); + // Our own vendored entries use a path dist — skip. 
+ if dist + .and_then(|d| d.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "path") + || parse_vendor_path(dist_url).is_some() + { + continue; + } + let is_zip = dist + .and_then(|d| d.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "zip"); + let shasum = dist + .and_then(|d| d.get("shasum")) + .and_then(Value::as_str) + .unwrap_or(""); + let integrity = if is_zip + && shasum.len() == 40 + && shasum.bytes().all(|b| b.is_ascii_hexdigit()) + { + LockIntegrity::Sha1Hex(shasum.to_ascii_lowercase()) + } else { + LockIntegrity::None + }; + let purl = format!("pkg:composer/{name}@{version}"); + out.push(LockfileEntry { + ecosystem: "composer", + name, + version, + purl, + resolved: is_zip.then(|| http_url(dist_url)).flatten(), + integrity, + }); + } + } + Some(dedup_prefer_integrity(out)) +} + +// ────────────────────────────── Gemfile.lock ────────────────────────────── + +/// Inventory `Gemfile.lock`: `GEM`-section `specs:` entries (4-space +/// indent; deeper lines are dependency ranges) plus the bundler ≥ 2.6 +/// `CHECKSUMS` section's sha256 values when present (older locks stay +/// discovery-only). Platform-suffixed specs (`nokogiri (1.16.5-arm64-…)`) +/// are skipped — platform gems are unsupported for vendoring anyway. 
+pub async fn inventory_gemfile_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("Gemfile.lock")) + .await + .ok()?; + let mut remote: Option = None; + let mut checksums: HashMap<(String, String), String> = HashMap::new(); + let mut specs: Vec<(String, String)> = Vec::new(); + + let mut section = ""; + let mut in_specs = false; + for line in text.lines() { + if !line.starts_with(' ') { + section = line.trim(); + in_specs = false; + continue; + } + let trimmed = line.trim_start(); + let indent = line.len() - trimmed.len(); + match section { + "GEM" => { + if indent == 2 { + if let Some(r) = trimmed.strip_prefix("remote:") { + let r = r.trim().trim_end_matches('/'); + if remote.is_none() && !r.is_empty() { + remote = Some(r.to_string()); + } + } + in_specs = trimmed == "specs:"; + } else if in_specs && indent == 4 { + if let Some((name, version)) = parse_gem_spec_line(trimmed) { + specs.push((name, version)); + } + } + } + "CHECKSUMS" => { + // ` name (version) sha256=hex` + if let Some((spec_part, hash_part)) = + trimmed.rsplit_once(" sha256=").map(|(s, h)| (s, h.trim())) + { + if let Some((name, version)) = parse_gem_spec_line(spec_part) { + if hash_part.len() == 64 + && hash_part.bytes().all(|b| b.is_ascii_hexdigit()) + { + checksums + .insert((name, version), hash_part.to_ascii_lowercase()); + } + } + } + } + _ => {} + } + } + if specs.is_empty() { + return None; + } + let base = remote.unwrap_or_else(|| "https://rubygems.org".to_string()); + let mut out = Vec::new(); + for (name, version) in specs { + if !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + { + continue; + } + let integrity = checksums + .get(&(name.clone(), version.clone())) + .map(|h| LockIntegrity::Sha256Hex(h.clone())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry { + ecosystem: "gem", + purl: format!("pkg:gem/{name}@{version}"), + resolved: 
http_url(&format!("{base}/downloads/{name}-{version}.gem")), + name, + version, + integrity, + }); + } + Some(dedup_prefer_integrity(out)) +} + +/// `name (version)` → parts; platform-suffixed versions (`1.2.3-x86_64…`) +/// and dependency lines (no parens / range operators) yield `None`. +fn parse_gem_spec_line(line: &str) -> Option<(String, String)> { + let (name, rest) = line.split_once(" (")?; + let version = rest.strip_suffix(')')?; + if name.is_empty() + || version.is_empty() + || version.contains(' ') + || version.contains('-') + || !version.chars().next().is_some_and(|c| c.is_ascii_digit()) + { + return None; + } + Some((name.to_string(), version.to_string())) +} + +// ─────────────────────────────── pypi locks ─────────────────────────────── + +/// PEP 503 name normalization (`Foo._Bar` → `foo-bar`) — pypi purls and +/// lock entries must compare in this form. +fn pep503(name: &str) -> String { + let mut out = String::with_capacity(name.len()); + let mut last_dash = false; + for c in name.chars() { + let c = c.to_ascii_lowercase(); + if c == '-' || c == '_' || c == '.' { + if !last_dash { + out.push('-'); + last_dash = true; + } + } else { + out.push(c); + last_dash = false; + } + } + out +} + +/// Inventory the pypi lock the project carries. Fetchable resolution +/// (URL + sha256 of a pure `py3-none-any` wheel) comes from `uv.lock`; +/// `poetry.lock` and `--hash`-pinned `requirements.txt` contribute +/// DISCOVERY-only entries (no recorded URL; platform-independent wheel +/// choice is not derivable offline). Pipenv/pdm locks: not yet read. +pub async fn inventory_pypi_locks(project_root: &Path) -> Option> { + if let Some(out) = inventory_uv_lock(project_root).await { + return Some(out); + } + if let Some(out) = inventory_poetry_lock(project_root).await { + return Some(out); + } + inventory_requirements_txt(project_root).await +} + +/// uv.lock: TOML `[[package]]` blocks with `name`/`version` and +/// `wheels = [{ url, hash = "sha256:…" }, …]` entries. 
+async fn inventory_uv_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("uv.lock")) + .await + .ok()?; + let mut out = Vec::new(); + // Line-oriented: uv emits `[[package]]` blocks; wheels live either as + // inline `{ url = "…", hash = "sha256:…" }` table rows or one-line + // arrays. A pure-python wheel ends `py3-none-any.whl`. + let mut name: Option = None; + let mut version: Option = None; + let mut sourced_registry = true; + let mut wheel: Option<(String, String)> = None; + let flush = |name: &mut Option, + version: &mut Option, + sourced_registry: &mut bool, + wheel: &mut Option<(String, String)>, + out: &mut Vec| { + if let (Some(n), Some(v)) = (name.take(), version.take()) { + let canonical = pep503(&n); + if *sourced_registry + && path_safety::is_safe_single_segment(&canonical) + && path_safety::is_safe_single_segment(&v) + { + let (resolved, integrity) = match wheel.take() { + Some((url, sha)) => (http_url(&url), LockIntegrity::Sha256Hex(sha)), + None => (None, LockIntegrity::None), + }; + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{canonical}@{v}"), + name: canonical, + version: v, + resolved, + integrity, + }); + } + } + *sourced_registry = true; + *wheel = None; + }; + for line in text.lines() { + let t = line.trim(); + if t == "[[package]]" { + flush(&mut name, &mut version, &mut sourced_registry, &mut wheel, &mut out); + continue; + } + if let Some(v) = t.strip_prefix("name = ") { + name = Some(v.trim_matches('"').to_string()); + } else if let Some(v) = t.strip_prefix("version = ") { + version = Some(v.trim_matches('"').to_string()); + } else if t.starts_with("source = ") { + // Registry packages: `source = { registry = "…" }`; editable/ + // virtual/path/git sources are not fetchable artifacts. 
+ sourced_registry = t.contains("registry"); + } else if wheel.is_none() && t.contains("py3-none-any.whl") { + // `{ url = "…py3-none-any.whl", hash = "sha256:…" }` + let url = t + .split("url = \"") + .nth(1) + .and_then(|r| r.split('"').next()) + .unwrap_or(""); + let sha = t + .split("hash = \"sha256:") + .nth(1) + .and_then(|r| r.split('"').next()) + .unwrap_or(""); + if !url.is_empty() && sha.len() == 64 && sha.bytes().all(|b| b.is_ascii_hexdigit()) { + wheel = Some((url.to_string(), sha.to_ascii_lowercase())); + } + } + } + flush(&mut name, &mut version, &mut sourced_registry, &mut wheel, &mut out); + Some(dedup_prefer_integrity(out)) +} + +/// poetry.lock: `[[package]]` blocks with `name`/`version` — discovery +/// only (file hashes exist but carry no URLs and no platform choice). +async fn inventory_poetry_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("poetry.lock")) + .await + .ok()?; + let mut out = Vec::new(); + let mut in_package = false; + let mut name: Option = None; + for line in text.lines() { + let t = line.trim(); + if t == "[[package]]" { + in_package = true; + name = None; + continue; + } + if t.starts_with('[') && t != "[[package]]" { + in_package = false; + continue; + } + if !in_package { + continue; + } + if let Some(v) = t.strip_prefix("name = ") { + name = Some(pep503(v.trim_matches('"'))); + } else if let Some(v) = t.strip_prefix("version = ") { + if let Some(n) = name.take() { + let v = v.trim_matches('"').to_string(); + if path_safety::is_safe_single_segment(&n) + && path_safety::is_safe_single_segment(&v) + { + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{n}@{v}"), + name: n, + version: v, + resolved: None, + integrity: LockIntegrity::None, + }); + } + } + } + } + if out.is_empty() { + return None; + } + Some(dedup_prefer_integrity(out)) +} + +/// requirements.txt with exact `==` pins — discovery only. 
+async fn inventory_requirements_txt(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("requirements.txt")) + .await + .ok()?; + let mut out = Vec::new(); + for line in text.lines() { + let t = line.trim(); + if t.is_empty() || t.starts_with('#') || t.starts_with('-') { + continue; + } + // `name==version` (strip extras, env markers, hash continuations). + let spec = t.split(';').next().unwrap_or(t).trim(); + let spec = spec.split_whitespace().next().unwrap_or(spec); + let Some((raw_name, version)) = spec.split_once("==") else { + continue; + }; + let name = pep503(raw_name.split('[').next().unwrap_or(raw_name).trim()); + let version = version.trim().to_string(); + if name.is_empty() + || !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + || !version.chars().next().is_some_and(|c| c.is_ascii_digit()) + { + continue; + } + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{name}@{version}"), + name, + version, + resolved: None, + integrity: LockIntegrity::None, + }); + } + if out.is_empty() { + return None; + } + Some(dedup_prefer_integrity(out)) +} + +#[cfg(test)] +mod tests { + use super::*; + + async fn write(root: &Path, name: &str, content: &str) { + tokio::fs::write(root.join(name), content).await.unwrap(); + } + + fn entry<'a>(entries: &'a [LockfileEntry], name: &str) -> &'a LockfileEntry { + entries + .iter() + .find(|e| e.name == name) + .unwrap_or_else(|| panic!("no entry for {name}: {entries:?}")) + } + + // ── package-lock ────────────────────────────────────────────────────── + + const PACKAGE_LOCK: &str = r#"{ + "name": "fixture", + "version": "1.0.0", + "lockfileVersion": 3, + "packages": { + "": { "name": "fixture", "version": "1.0.0" }, + "packages/member": { "name": "member", "version": "0.0.1" }, + "node_modules/member": { "resolved": "packages/member", "link": true }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": 
"https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "integrity": "sha512-XI5MPz==" + }, + "node_modules/@scope/pkg": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@scope/pkg/-/pkg-2.0.0.tgz", + "integrity": "sha512-scoped==" + }, + "node_modules/bundled-dep": { + "version": "1.0.0", + "inBundle": true + }, + "node_modules/git-dep": { + "version": "0.5.0", + "resolved": "git+ssh://git@github.com/x/git-dep.git#abc" + }, + "node_modules/vendored": { + "version": "3.0.0", + "resolved": "file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz", + "integrity": "sha512-ours==" + }, + "node_modules/evil": { + "version": "../../escape", + "resolved": "https://registry.npmjs.org/evil/-/evil-1.0.0.tgz", + "integrity": "sha512-evil==" + } + } +} +"#; + + #[tokio::test] + async fn package_lock_inventories_registry_entries() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::PackageLock); + + let lp = entry(&entries, "left-pad"); + assert_eq!(lp.version, "1.3.0"); + assert_eq!(lp.purl, "pkg:npm/left-pad@1.3.0"); + assert_eq!( + lp.resolved.as_deref(), + Some("https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz") + ); + assert_eq!(lp.integrity, LockIntegrity::Sri("sha512-XI5MPz==".into())); + + let scoped = entry(&entries, "@scope/pkg"); + assert_eq!(scoped.purl, "pkg:npm/@scope/pkg@2.0.0"); + + // git deps stay listed (discovery) but carry no fetchable URL. + let git = entry(&entries, "git-dep"); + assert_eq!(git.resolved, None); + assert_eq!(git.integrity, LockIntegrity::None); + + // Workspace members, links, bundled deps, our vendored spec, and + // the unsafe-version entry are all absent. 
+ for absent in ["member", "fixture", "bundled-dep", "vendored", "evil"] { + assert!( + !entries.iter().any(|e| e.name == absent), + "{absent} must not be inventoried: {entries:?}" + ); + } + } + + #[tokio::test] + async fn shrinkwrap_wins_over_package_lock() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + write( + tmp.path(), + "npm-shrinkwrap.json", + r#"{ "lockfileVersion": 3, "packages": { + "node_modules/only-in-shrinkwrap": { "version": "9.9.9" } } }"#, + ) + .await; + + let (_, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert!(entries.iter().any(|e| e.name == "only-in-shrinkwrap")); + assert!(!entries.iter().any(|e| e.name == "left-pad")); + } + + #[tokio::test] + async fn legacy_v1_lock_without_packages_map_yields_none() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "package-lock.json", + r#"{ "lockfileVersion": 1, "dependencies": { "left-pad": { "version": "1.3.0" } } }"#, + ) + .await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + } + + // ── pnpm ────────────────────────────────────────────────────────────── + + const PNPM_LOCK: &str = "lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + +importers: + + .: + dependencies: + left-pad: + specifier: 1.3.0 + version: 1.3.0 + +packages: + + left-pad@1.3.0: + resolution: {integrity: sha512-XI5MPz==} + + '@scope/pkg@2.0.0': + resolution: {integrity: sha512-scoped==} + + peer-user@4.0.0(left-pad@1.3.0): + resolution: {integrity: sha512-peer==} + + local-thing@file:packages/local: + resolution: {directory: packages/local, type: directory} + + vendored@file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz: + resolution: {integrity: sha512-ours==, tarball: file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz} + +snapshots: + + left-pad@1.3.0: {} +"; + + #[tokio::test] + async fn pnpm_v9_keys_parse_with_peer_suffix_and_scoped_quoting() { + 
let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "pnpm-lock.yaml", PNPM_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::Pnpm); + + assert_eq!( + entry(&entries, "left-pad").integrity, + LockIntegrity::Sri("sha512-XI5MPz==".into()) + ); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + assert_eq!(entry(&entries, "peer-user").version, "4.0.0"); + // registry entries carry no URL in v9 — constructed at fetch time. + assert_eq!(entry(&entries, "left-pad").resolved, None); + for absent in ["local-thing", "vendored"] { + assert!(!entries.iter().any(|e| e.name == absent), "{entries:?}"); + } + } + + // ── yarn classic ────────────────────────────────────────────────────── + + const YARN_CLASSIC: &str = "# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. +# yarn lockfile v1 + + +\"@scope/pkg@^2.0.0\": + version \"2.0.0\" + resolved \"https://registry.yarnpkg.com/@scope/pkg/-/pkg-2.0.0.tgz#aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\" + integrity sha512-scoped== + +left-pad@1.3.0, left-pad@^1.3.0: + version \"1.3.0\" + resolved \"https://registry.yarnpkg.com/left-pad/-/left-pad-1.3.0.tgz#bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\" + integrity sha512-XI5MPz== + +old-school@0.1.0: + version \"0.1.0\" + resolved \"https://registry.yarnpkg.com/old-school/-/old-school-0.1.0.tgz#cccccccccccccccccccccccccccccccccccccccc\" + +aliased@npm:real-name@^3.0.0: + version \"3.0.0\" + resolved \"https://registry.yarnpkg.com/real-name/-/real-name-3.0.0.tgz#dddddddddddddddddddddddddddddddddddddddd\" + integrity sha512-alias== +"; + + #[tokio::test] + async fn yarn_classic_blocks_yield_resolved_sha1_and_integrity() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "yarn.lock", YARN_CLASSIC).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::YarnClassic); + + let lp = entry(&entries, "left-pad"); + 
assert_eq!( + lp.resolved.as_deref(), + Some("https://registry.yarnpkg.com/left-pad/-/left-pad-1.3.0.tgz"), + "the #sha1 fragment is split off the URL" + ); + assert_eq!(lp.integrity, LockIntegrity::Sri("sha512-XI5MPz==".into())); + + // Integrity-less old locks fall back to the sha1 fragment. + assert_eq!( + entry(&entries, "old-school").integrity, + LockIntegrity::Sha1Hex("c".repeat(40)) + ); + + // `alias@npm:real@range` resolves to the real name. + assert!(entries.iter().any(|e| e.name == "real-name")); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + } + + // ── yarn berry ──────────────────────────────────────────────────────── + + const YARN_BERRY: &str = "# This file is generated by running \"yarn install\" inside your project. +# Manifest files (package.json) are also used. + +__metadata: + version: 8 + cacheKey: 10c0 + +\"fixture@workspace:.\": + version: 0.0.0-use.local + resolution: \"fixture@workspace:.\" + languageName: unknown + linkType: soft + +\"left-pad@npm:1.3.0\": + version: 1.3.0 + resolution: \"left-pad@npm:1.3.0\" + checksum: 10c0/deadbeefcafe== + languageName: node + linkType: hard + +\"@scope/pkg@npm:^2.0.0\": + version: 2.0.0 + resolution: \"@scope/pkg@npm:2.0.0\" + checksum: 10c0/scopedchecksum== + languageName: node + linkType: hard +"; + + #[tokio::test] + async fn yarn_berry_registry_resolutions_inventory_with_checksums() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "yarn.lock", YARN_BERRY).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::YarnBerry); + + let lp = entry(&entries, "left-pad"); + assert_eq!(lp.version, "1.3.0"); + assert_eq!( + lp.integrity, + LockIntegrity::BerryChecksum("10c0/deadbeefcafe==".into()) + ); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + // The workspace root is not a registry package. 
+ assert!(!entries.iter().any(|e| e.name == "fixture"), "{entries:?}"); + } + + // ── bun ─────────────────────────────────────────────────────────────── + + const BUN_LOCK: &str = r#"{ + "lockfileVersion": 1, + "workspaces": { + "": { "name": "fixture", "dependencies": { "left-pad": "1.3.0" } }, + }, + "packages": { + "left-pad": ["left-pad@1.3.0", "", {}, "sha512-XI5MPz=="], + "@scope/pkg": ["@scope/pkg@2.0.0", "", {}, "sha512-scoped=="], + "vendored": ["vendored@file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz", {}], + "linked": ["linked@workspace:packages/linked", {}], + } +} +"#; + + #[tokio::test] + async fn bun_registry_tuples_parse_and_locals_are_skipped() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "bun.lock", BUN_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::Bun); + + assert_eq!( + entry(&entries, "left-pad").integrity, + LockIntegrity::Sri("sha512-XI5MPz==".into()) + ); + assert_eq!(entry(&entries, "left-pad").resolved, None); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + for absent in ["vendored", "linked"] { + assert!(!entries.iter().any(|e| e.name == absent), "{entries:?}"); + } + } + + // ── shared semantics ────────────────────────────────────────────────── + + #[tokio::test] + async fn lookup_bridges_percent_encoded_purls() { + let entries = vec![ + LockfileEntry::npm("@scope/pkg", "2.0.0", None, LockIntegrity::None), + LockfileEntry::npm("left-pad", "1.3.0", None, LockIntegrity::None), + ]; + assert!(lookup(&entries, "pkg:npm/%40scope/pkg@2.0.0").is_some()); + assert!(lookup(&entries, "pkg:npm/@scope/pkg@2.0.0").is_some()); + assert!(lookup(&entries, "pkg:npm/left-pad@1.3.0?artifact_id=x").is_some()); + assert!(lookup(&entries, "pkg:npm/left-pad@9.9.9").is_none()); + assert!(lookup(&entries, "pkg:pypi/left-pad@1.3.0").is_none()); + } + + #[tokio::test] + async fn 
dedup_prefers_integrity_bearing_instance() { + let raw = vec![ + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::Sri("sha512-x==".into())), + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), + ]; + let out = finalize_npm(raw); + assert_eq!(out.len(), 1); + assert_eq!(out[0].integrity, LockIntegrity::Sri("sha512-x==".into())); + } + + #[cfg(feature = "cargo")] + #[tokio::test] + async fn cargo_lock_inventories_crates_io_entries() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "Cargo.lock", + r#"# This file is automatically @generated by Cargo. +version = 4 + +[[package]] +name = "fixture" +version = "0.1.0" + +[[package]] +name = "serde" +version = "1.0.200" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f" + +[[package]] +name = "git-dep" +version = "0.5.0" +source = "git+https://github.com/x/git-dep?rev=abc#abc" + +[[package]] +name = "sparse-crate" +version = "2.0.0" +source = "sparse+https://index.crates.io/" +checksum = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" +"#, + ) + .await; + + let entries = inventory_cargo_lock(tmp.path()).await.unwrap(); + let serde_entry = entry(&entries, "serde"); + assert_eq!(serde_entry.version, "1.0.200"); + assert_eq!(serde_entry.purl, "pkg:cargo/serde@1.0.200"); + assert_eq!( + serde_entry.integrity, + LockIntegrity::Sha256Hex( + "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f".into() + ) + ); + assert!(matches!( + entry(&entries, "sparse-crate").integrity, + LockIntegrity::Sha256Hex(_) + )); + // Workspace member (no source) excluded; git source unverifiable. 
+ assert!(!entries.iter().any(|e| e.name == "fixture")); + assert_eq!(entry(&entries, "git-dep").integrity, LockIntegrity::None); + } + + #[cfg(feature = "golang")] + #[tokio::test] + async fn go_sum_inventories_module_zip_lines() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "go.sum", + "github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=\n\ + github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=\n\ + golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=\n", + ) + .await; + + let entries = inventory_go_sum(tmp.path()).await.unwrap(); + assert_eq!(entries.len(), 2, "the /go.mod line is skipped: {entries:?}"); + let gin = entry(&entries, "github.com/gin-gonic/gin"); + assert_eq!(gin.version, "v1.9.1"); + assert_eq!(gin.purl, "pkg:golang/github.com/gin-gonic/gin@v1.9.1"); + assert_eq!( + gin.integrity, + LockIntegrity::GoH1("h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=".into()) + ); + } + + #[tokio::test] + async fn lookup_matches_cargo_and_golang_purls() { + let entries = vec![ + LockfileEntry { + ecosystem: "cargo", + name: "serde".into(), + version: "1.0.200".into(), + purl: "pkg:cargo/serde@1.0.200".into(), + resolved: None, + integrity: LockIntegrity::None, + }, + LockfileEntry { + ecosystem: "golang", + name: "github.com/x/y".into(), + version: "v1.0.0".into(), + purl: "pkg:golang/github.com/x/y@v1.0.0".into(), + resolved: None, + integrity: LockIntegrity::None, + }, + ]; + assert!(lookup(&entries, "pkg:cargo/serde@1.0.200").is_some()); + assert!(lookup(&entries, "pkg:golang/github.com/x/y@v1.0.0").is_some()); + assert!(lookup(&entries, "pkg:cargo/serde@9.9.9").is_none()); + assert!( + lookup(&entries, "pkg:npm/serde@1.0.200").is_none(), + "ecosystem tags must match, not just name@version" + ); + } + + #[cfg(feature = "composer")] + #[tokio::test] + async fn composer_lock_inventories_dist_entries() { + let tmp = tempfile::tempdir().unwrap(); + write( + 
tmp.path(), + "composer.lock", + r#"{ + "packages": [ + { + "name": "Monolog/Monolog", + "version": "v3.5.0", + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Seldaek/monolog/zipball/abc", + "shasum": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + } + }, + { + "name": "vendored/pkg", + "version": "1.0.0", + "dist": { "type": "path", "url": ".socket/vendor/composer/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored/pkg@1.0.0" } + } + ], + "packages-dev": [ + { + "name": "symfony/console", + "version": "v6.4.1", + "dist": { "type": "zip", "url": "https://example.com/console.zip", "shasum": "" } + } + ] +}"#, + ) + .await; + + let entries = inventory_composer_lock(tmp.path()).await.unwrap(); + let monolog = entry(&entries, "monolog/monolog"); + assert_eq!(monolog.version, "3.5.0", "leading v dropped, name lowercased"); + assert_eq!(monolog.purl, "pkg:composer/monolog/monolog@3.5.0"); + assert!(matches!(monolog.integrity, LockIntegrity::Sha1Hex(_))); + assert!(monolog.resolved.as_deref().unwrap().contains("zipball")); + // Empty shasum → discovery-only; path dist (ours) excluded. 
+ assert_eq!( + entry(&entries, "symfony/console").integrity, + LockIntegrity::None + ); + assert!(!entries.iter().any(|e| e.name == "vendored/pkg")); + } + + #[tokio::test] + async fn gemfile_lock_inventories_specs_and_checksums() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "Gemfile.lock", + "GEM\n remote: https://rubygems.org/\n specs:\n rails (7.1.0)\n \ + actionpack (= 7.1.0)\n rack (3.0.8)\n nokogiri (1.16.5-arm64-darwin)\n\n\ + PLATFORMS\n ruby\n\nDEPENDENCIES\n rails\n\nCHECKSUMS\n \ + rails (7.1.0) sha256=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n\n\ + BUNDLED WITH\n 2.6.0\n", + ) + .await; + + let entries = inventory_gemfile_lock(tmp.path()).await.unwrap(); + let rails = entry(&entries, "rails"); + assert_eq!(rails.version, "7.1.0"); + assert_eq!(rails.purl, "pkg:gem/rails@7.1.0"); + assert!(matches!(rails.integrity, LockIntegrity::Sha256Hex(_))); + assert_eq!( + rails.resolved.as_deref(), + Some("https://rubygems.org/downloads/rails-7.1.0.gem") + ); + // No CHECKSUMS entry → discovery-only; platform gem skipped; + // dependency range lines never parse as specs. 
+ assert_eq!(entry(&entries, "rack").integrity, LockIntegrity::None); + assert!(!entries.iter().any(|e| e.name == "nokogiri")); + assert!(!entries.iter().any(|e| e.name == "actionpack")); + } + + #[tokio::test] + async fn uv_lock_inventories_pure_wheels() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "uv.lock", + r#"version = 1 + +[[package]] +name = "Requests" +version = "2.28.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/requests-2.28.0-py3-none-any.whl", hash = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" }, +] + +[[package]] +name = "native-only" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/native_only-1.0.0-cp312-macosx.whl", hash = "sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" }, +] + +[[package]] +name = "local-proj" +version = "0.0.1" +source = { editable = "." } +"#, + ) + .await; + + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + let requests = entry(&entries, "requests"); + assert_eq!(requests.purl, "pkg:pypi/requests@2.28.0", "PEP 503 name"); + assert!(matches!(requests.integrity, LockIntegrity::Sha256Hex(_))); + assert!(requests + .resolved + .as_deref() + .unwrap() + .ends_with("py3-none-any.whl")); + // Platform-only wheels → discovery-only; editable sources excluded. 
+ assert_eq!( + entry(&entries, "native-only").integrity, + LockIntegrity::None + ); + assert!(!entries.iter().any(|e| e.name == "local-proj")); + } + + #[tokio::test] + async fn poetry_and_requirements_are_discovery_only() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "poetry.lock", + "[[package]]\nname = \"Flask_Login\"\nversion = \"0.6.3\"\n\n[metadata]\nlock-version = \"2.0\"\n", + ) + .await; + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + let fl = entry(&entries, "flask-login"); + assert_eq!(fl.purl, "pkg:pypi/flask-login@0.6.3"); + assert_eq!(fl.integrity, LockIntegrity::None); + + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "requirements.txt", + "# pinned\nrequests[security]==2.28.0 --hash=sha256:abc \\\n --hash=sha256:def\nflask>=2.0\n-e .\n", + ) + .await; + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + assert_eq!(entries.len(), 1, "{entries:?}"); + assert_eq!(entries[0].purl, "pkg:pypi/requests@2.28.0"); + } + + #[tokio::test] + async fn unsupported_flavors_yield_none() { + // PnP marker wins over any lockfile. + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), ".pnp.cjs", "/* pnp */").await; + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + + // pnpm v6. + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "pnpm-lock.yaml", "lockfileVersion: '6.0'\n").await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + + // No lockfile at all. 
+ let tmp = tempfile::tempdir().unwrap(); + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 7d60fdc..8b13f22 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -53,6 +53,8 @@ pub mod cargo_lock; #[cfg(feature = "composer")] pub mod composer_lock; pub mod gem; +pub mod lock_inventory; +pub mod registry_fetch; #[cfg(feature = "golang")] pub mod golang; mod npm_common; @@ -75,7 +77,14 @@ pub mod yarn_classic_lock; pub use path::{ecosystem_dir_for_purl, parse_vendor_path, VendorPathParts, VENDOR_DIR}; pub use state::{load_state, save_state, VendorEntry, VendorState, VENDOR_STATE_REL}; -use crate::patch::apply::ApplyResult; +use std::collections::HashMap; +use std::path::Path; + +use crate::manifest::schema::{PatchFileInfo, PatchRecord}; +use crate::patch::apply::{ + apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + VerifyStatus, +}; /// A non-fatal advisory surfaced as a warning event (`code` is a stable /// reason tag from the CLI contract; `detail` is human text). @@ -94,6 +103,144 @@ impl VendorWarning { } } +/// One warning per staged file whose pre-patch content matched NEITHER +/// `beforeHash` nor `afterHash` and was overwritten with the verified +/// patched content (vendor staging always force-applies — the stage is a +/// private copy, and every apply write path is hash-gated to exactly +/// `afterHash`). +/// +/// Detection rides the verify signature `apply_package_patch` leaves +/// behind: a force-promoted file keeps `status: Ready` WITH +/// `expected_hash: Some(..)` and a differing `current_hash`, whereas a +/// cleanly-verified file carries `expected_hash: None` (see +/// `verify_file_patch`). 
+pub(crate) fn mismatch_overwrite_warnings(
+    result: &ApplyResult,
+    name: &str,
+    version: &str,
+) -> Vec<VendorWarning> {
+    let mut warnings: Vec<VendorWarning> = result
+        .files_verified
+        .iter()
+        // Force-promoted signature: the file is still `Ready`, yet an
+        // expected hash was recorded and the current hash differs from it.
+        .filter(|v| {
+            v.status == VerifyStatus::Ready
+                && v.expected_hash.is_some()
+                && v.current_hash != v.expected_hash
+        })
+        .map(|v| {
+            VendorWarning::new(
+                "vendor_content_mismatch_overwritten",
+                format!(
+                    "installed {name}@{version} does not match this patch's expected original \
+                     ({}); vendored the patched content anyway",
+                    v.file
+                ),
+            )
+        })
+        .collect();
+    // HashMap-driven verify order is randomized; keep warning order stable.
+    warnings.sort_by(|a, b| a.detail.cmp(&b.detail));
+    warnings
+}
+
+/// Patch-target files (non-empty `beforeHash`) absent from the staged
+/// copy. Vendor staging force-applies (see [`force_apply_staged`]), and
+/// force silently SKIPS missing files — which would pack an artifact
+/// without the fix. This pre-check restores the strict apply's
+/// fail-closed behavior for the non-`--force` path. Unsafe keys are
+/// skipped here: the apply pipeline itself rejects them fail-closed.
+///
+/// Returns the manifest keys (as written in `files`, not normalized) of
+/// every missing target, sorted so the reported order is deterministic.
+pub(crate) async fn missing_existing_patch_files(
+    staged_dir: &Path,
+    files: &HashMap<String, PatchFileInfo>,
+) -> Vec<String> {
+    let mut missing: Vec<String> = Vec::new();
+    for (file_name, info) in files {
+        if info.before_hash.is_empty() {
+            continue; // a new file is expected to not exist yet
+        }
+        let normalized = normalize_file_path(file_name);
+        if !is_safe_relative_subpath(normalized) {
+            continue;
+        }
+        // Any metadata error (not only "not found") counts as missing.
+        if tokio::fs::metadata(staged_dir.join(normalized)).await.is_err() {
+            missing.push(file_name.clone());
+        }
+    }
+    // HashMap iteration order is unspecified; sort for a stable result.
+    missing.sort();
+    missing
+}
+
+/// A failed synthesized [`ApplyResult`] in the shape the strict apply
+/// pipeline would have produced (success=false, `error` set, no files).
+pub(crate) fn failed_apply_result(purl: &str, error: String) -> ApplyResult {
+    // Nothing was verified or written, so every per-file collection is
+    // empty and only the key and the error text carry information.
+    let package_key = String::from(purl);
+    let error = Some(error);
+    ApplyResult {
+        success: false,
+        package_key,
+        error,
+        package_path: String::new(),
+        files_verified: Vec::new(),
+        files_patched: Vec::new(),
+        applied_via: HashMap::new(),
+        sidecar: None,
+    }
+}
+
+/// Run the hardened apply pipeline against a vendor stage/copy with the
+/// vendor auto-force policy:
+///
+/// * Missing patch-target files fail closed unless the caller's own
+///   `--force` asked for that skip tolerance.
+/// * The apply itself ALWAYS forces: the stage is a private copy (never
+///   the user's tree), and every apply write path is hash-gated to
+///   exactly `afterHash` (the archive and blob paths verify content
+///   BEFORE writing; the diff path self-disables on a base mismatch) —
+///   forcing can only produce the verified patched content or fail
+///   closed. This is what lets vendor succeed on a package already
+///   patched in place by `apply`, or on a patch whose `beforeHash` was
+///   built against different bytes than the installed artifact.
+/// * Every force-overwritten file (content matched NEITHER hash) emits a
+///   `vendor_content_mismatch_overwritten` warning — including on dry
+///   runs, so previews predict the real outcome.
+// Nine parameters: the apply inputs plus the warning context (`name`,
+// `version`, and the `warnings` sink), hence the lint exemption.
+#[allow(clippy::too_many_arguments)]
+pub(crate) async fn force_apply_staged(
+    purl: &str,
+    staged_dir: &Path,
+    record: &PatchRecord,
+    sources: &PatchSources<'_>,
+    dry_run: bool,
+    force: bool,
+    name: &str,
+    version: &str,
+    warnings: &mut Vec<VendorWarning>,
+) -> ApplyResult {
+    // Without the caller's `--force`, a missing patch target fails closed
+    // before the pipeline runs; only the first (sorted) key is reported.
+    if !force {
+        let missing = missing_existing_patch_files(staged_dir, &record.files).await;
+        if let Some(first) = missing.first() {
+            return failed_apply_result(
+                purl,
+                format!("Cannot apply patch: {first} - File not found"),
+            );
+        }
+    }
+    let result = apply_package_patch(
+        purl,
+        staged_dir,
+        &record.files,
+        sources,
+        Some(&record.uuid),
+        dry_run,
+        // The stage is private and every write path is afterHash-gated;
+        // Force additionally covers the caller's --force NotFound-skip
+        // (the missing-file pre-check above handles the default case).
+        crate::patch::apply::MismatchPolicy::Force,
+    )
+    .await;
+    // Overwrite warnings are only surfaced for a successful apply; a failed
+    // result is returned as-is and leaves `warnings` untouched.
+    if result.success {
+        warnings.extend(mismatch_overwrite_warnings(&result, name, version));
+    }
+    result
+}
+
 /// The result of one backend `vendor_*` call.
// // `large_enum_variant`: `Done` is much bigger than `Refused` because it carries @@ -187,3 +334,63 @@ pub async fn vendored_purl_keys( Err(_) => std::collections::HashSet::new(), } } + +#[cfg(test)] +mod policy_tests { + use super::*; + use crate::patch::apply::VerifyResult; + + fn verify(status: VerifyStatus, expected: Option<&str>, current: Option<&str>) -> VerifyResult { + VerifyResult { + file: "package/index.js".to_string(), + status, + message: None, + current_hash: current.map(str::to_string), + expected_hash: expected.map(str::to_string), + target_hash: None, + } + } + + fn result_with(files_verified: Vec) -> ApplyResult { + ApplyResult { + package_key: "pkg:npm/x@1.0.0".to_string(), + package_path: String::new(), + success: true, + files_verified, + files_patched: Vec::new(), + applied_via: HashMap::new(), + error: None, + sidecar: None, + } + } + + /// Only the force-promoted signature (`Ready` + `expected_hash: Some` + + /// differing `current_hash`) flags an overwrite; clean verifies and + /// AlreadyPatched files never do. + #[test] + fn mismatch_overwrite_warnings_detects_promoted_ready() { + // Force-promoted mismatch: flagged. + let r = result_with(vec![verify(VerifyStatus::Ready, Some("aa"), Some("bb"))]); + let w = mismatch_overwrite_warnings(&r, "left-pad", "1.3.0"); + assert_eq!(w.len(), 1); + assert_eq!(w[0].code, "vendor_content_mismatch_overwritten"); + assert!(w[0].detail.contains("left-pad@1.3.0")); + assert!(w[0].detail.contains("package/index.js")); + + // Clean Ready (verify matched beforeHash): expected_hash is None. + let r = result_with(vec![verify(VerifyStatus::Ready, None, Some("aa"))]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + + // AlreadyPatched (afterHash content): not a mismatch. + let r = result_with(vec![verify( + VerifyStatus::AlreadyPatched, + None, + Some("after"), + )]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + + // NotFound (force-skipped): not an overwrite. 
+ let r = result_with(vec![verify(VerifyStatus::NotFound, None, None)]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/npm_common.rs b/crates/socket-patch-core/src/patch/vendor/npm_common.rs index 1b26cc4..e2b4cdb 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_common.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_common.rs @@ -12,30 +12,33 @@ //! the project byte-untouched (a dry run stops after verification and //! creates nothing on disk). -use std::collections::HashMap; use std::path::Path; use serde_json::Value; use crate::manifest::schema::PatchRecord; -use crate::patch::apply::{apply_package_patch, normalize_file_path, ApplyResult, PatchSources}; +use crate::patch::apply::{normalize_file_path, ApplyResult, PatchSources}; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::path_safety; -use crate::utils::purl::strip_purl_qualifiers; +use crate::utils::purl::{percent_decode_purl_component, strip_purl_qualifiers}; use super::npm_pack::{pack_deterministic, PackedTarball}; use super::path::vendor_uuid_dir_rel; -use super::VendorOutcome; +use super::{VendorOutcome, VendorWarning}; /// Validated npm vendoring coordinates (the output of -/// [`guard_coordinates`]). `name`/`version` borrow from the purl. +/// [`guard_coordinates`]). `name`/`version` are the percent-DECODED purl +/// components (the API serves scoped purls as `%40scope/name`; the +/// lockfile and node_modules carry the literal `@scope/name`). #[derive(Debug)] -pub(super) struct NpmCoords<'a> { - pub name: &'a str, - pub version: &'a str, +pub(super) struct NpmCoords { + pub name: String, + pub version: String, /// `.socket/vendor/npm/` (validated, forward slashes). pub uuid_dir_rel: String, - /// Qualifier-free base PURL. 
+ /// Qualifier-free base PURL — VERBATIM (still encoded when the API + /// encoded it): the ledger's `base_purl`/entry keys must keep + /// matching the manifest keys, which store the purl as-served. pub base_purl: String, } @@ -49,17 +52,17 @@ pub(super) struct NpmCoords<'a> { /// vendor, arbitrary delete on revert) — reject fail-closed before any disk /// access. `Err` carries a ready [`VendorOutcome::Refused`] to bubble /// verbatim. -pub(super) fn guard_coordinates<'a>( - purl: &'a str, +pub(super) fn guard_coordinates( + purl: &str, record: &PatchRecord, -) -> Result, Box> { +) -> Result> { let Some((name, version)) = parse_npm_purl(purl) else { return Err(Box::new(refused( "unsafe_coordinates", format!("cannot parse an npm name@version out of `{purl}`"), ))); }; - if !is_safe_npm_name(name) || !path_safety::is_safe_single_segment(version) { + if !is_safe_npm_name(&name) || !path_safety::is_safe_single_segment(&version) { return Err(Box::new(refused( "unsafe_coordinates", format!( @@ -118,6 +121,7 @@ pub(super) struct NpmStagedPack { /// verification — no pack, no dirs created). /// * `Ok((Some(staged), result))` — full success: the tarball is on disk at /// `staged.rel_tgz` and the caller proceeds to its lockfile wiring. 
+#[allow(clippy::too_many_arguments)] pub(super) async fn stage_patch_pack( purl: &str, installed_dir: &Path, @@ -126,6 +130,7 @@ pub(super) async fn stage_patch_pack( sources: &PatchSources<'_>, dry_run: bool, force: bool, + warnings: &mut Vec, ) -> Result<(Option, ApplyResult), Box> { let coords = guard_coordinates(purl, record)?; @@ -175,18 +180,21 @@ pub(super) async fn stage_patch_pack( } } - // Delegate to the hardened apply pipeline, pointed at the stage (which + // Delegate to the hardened apply pipeline (with the vendor auto-force + // policy — see `force_apply_staged`), pointed at the stage (which // plays the role of the installed package dir — manifest npm keys carry // the `package/` prefix and `apply` strips it via `normalize_file_path`, // exactly as it does for an in-place npm apply). - let result = apply_package_patch( + let result = super::force_apply_staged( purl, &stage, - &record.files, + record, sources, - Some(&record.uuid), dry_run, force, + &coords.name, + &coords.version, + warnings, ) .await; // A failed patch never packs (wiring is last — the caller returns with @@ -199,7 +207,7 @@ pub(super) async fn stage_patch_pack( let rel_tgz = format!( "{}/{}", coords.uuid_dir_rel, - tgz_rel_leaf(coords.name, coords.version) + tgz_rel_leaf(&coords.name, &coords.version) ); let dest = project_root.join(&rel_tgz); if let Some(parent) = dest.parent() { @@ -236,8 +244,8 @@ pub(super) async fn stage_patch_pack( Ok(( Some(NpmStagedPack { - name: coords.name.to_string(), - version: coords.version.to_string(), + name: coords.name, + version: coords.version, rel_tgz, packed, staged_pkg_json, @@ -251,14 +259,27 @@ pub(super) async fn stage_patch_pack( /// `pkg:npm/[@scope/]name@version` → `(name, version)`; scoped names keep /// the `@scope/` prefix. The LAST `@` separates the version (a leading /// scope-`@` is at index 0 and never the last `@` of a versioned purl). 
-pub(super) fn parse_npm_purl(purl: &str) -> Option<(&str, &str)> { +/// +/// Components are percent-DECODED (the API serves `pkg:npm/%40scope/...`). +/// SECURITY: each segment decodes independently AFTER the `/`/`@` splits, +/// and the post-decode `is_safe_npm_name`/`is_safe_single_segment` gates in +/// [`guard_coordinates`] reject any separator or traversal sequence a +/// decode may have surfaced (`%2e%2e`, `%2f`, ...) — decoding never runs +/// after the guards. +pub(super) fn parse_npm_purl(purl: &str) -> Option<(String, String)> { let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:npm/")?; let at = rest.rfind('@').filter(|&i| i > 0)?; - let (name, version) = (&rest[..at], &rest[at + 1..]); - if name.is_empty() || version.is_empty() { + let (name_raw, version_raw) = (&rest[..at], &rest[at + 1..]); + if name_raw.is_empty() || version_raw.is_empty() { return None; } + let name = name_raw + .split('/') + .map(percent_decode_purl_component) + .collect::>() + .join("/"); + let version = percent_decode_purl_component(version_raw).into_owned(); Some((name, version)) } @@ -314,16 +335,7 @@ pub(super) fn refused(code: &'static str, detail: String) -> VendorOutcome { /// results. 
pub(super) fn done_failure(purl: &str, error: String) -> VendorOutcome { VendorOutcome::Done { - result: ApplyResult { - package_key: purl.to_string(), - package_path: String::new(), - success: false, - files_verified: Vec::new(), - files_patched: Vec::new(), - applied_via: HashMap::new(), - error: Some(error), - sidecar: None, - }, + result: super::failed_apply_result(purl, error), entry: None, warnings: Vec::new(), } @@ -333,6 +345,7 @@ pub(super) fn done_failure(purl: &str, error: String) -> VendorOutcome { mod tests { use super::*; use crate::manifest::schema::PatchFileInfo; + use std::collections::HashMap; const UUID: &str = "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"; @@ -369,18 +382,42 @@ mod tests { fn guard_coordinates_accepts_plain_and_scoped_names() { let record = record_with_uuid(UUID); let coords = guard_coordinates("pkg:npm/left-pad@1.3.0", &record).unwrap(); - assert_eq!((coords.name, coords.version), ("left-pad", "1.3.0")); + assert_eq!((coords.name.as_str(), coords.version.as_str()), ("left-pad", "1.3.0")); assert_eq!(coords.uuid_dir_rel, format!(".socket/vendor/npm/{UUID}")); assert_eq!(coords.base_purl, "pkg:npm/left-pad@1.3.0"); let coords = guard_coordinates("pkg:npm/@scope/pkg@1.0.0?artifact_id=x", &record).unwrap(); - assert_eq!((coords.name, coords.version), ("@scope/pkg", "1.0.0")); + assert_eq!((coords.name.as_str(), coords.version.as_str()), ("@scope/pkg", "1.0.0")); assert_eq!( coords.base_purl, "pkg:npm/@scope/pkg@1.0.0", "qualifiers stripped" ); } + /// The API serves scoped purls percent-encoded; the coordinates must + /// decode to the literal `@scope/name` (which keys the lockfile and + /// the artifact path), while `base_purl` stays verbatim — the ledger + /// must keep matching the manifest key as-served. 
+ #[test] + fn guard_coordinates_decodes_percent_encoded_scope() { + let record = record_with_uuid(UUID); + let coords = + guard_coordinates("pkg:npm/%40modelcontextprotocol/sdk@1.12.0", &record).unwrap(); + assert_eq!( + (coords.name.as_str(), coords.version.as_str()), + ("@modelcontextprotocol/sdk", "1.12.0") + ); + assert_eq!( + coords.base_purl, "pkg:npm/%40modelcontextprotocol/sdk@1.12.0", + "base_purl stays verbatim-encoded (manifest/ledger key parity)" + ); + assert_eq!( + tgz_rel_leaf(&coords.name, &coords.version), + "@modelcontextprotocol/sdk-1.12.0.tgz", + "artifact leaf is built from the decoded name" + ); + } + #[test] fn guard_coordinates_refuses_fail_closed() { let record = record_with_uuid(UUID); @@ -399,6 +436,20 @@ mod tests { guard_coordinates("pkg:npm/x@../1.0.0", &record).unwrap_err(), "unsafe_coordinates", ); + // SECURITY: percent-encoded traversal must be rejected POST-decode — + // guarding the encoded form would be a bypass (`%2e%2e` → `..`). + expect_refusal( + guard_coordinates("pkg:npm/%2e%2e/escape@1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); + expect_refusal( + guard_coordinates("pkg:npm/@scope/%2e%2e%2f%2e%2e@1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); + expect_refusal( + guard_coordinates("pkg:npm/x@%2e%2e%2f1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); // Tampered uuid. let record = record_with_uuid("../../x"); expect_refusal( diff --git a/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs b/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs index eb04e9c..b0efc65 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs @@ -366,6 +366,54 @@ pub async fn vendor_npm_any( outcome } +/// Is this npm-vendored entry still consumed by its lockfile's dependency +/// graph? +/// +/// `Some(true)`: the lockfile still resolves something to the entry's +/// artifact. 
`Some(false)`: the lockfile is present and parses but no +/// resolution references `.socket/vendor/npm//` — the dependency +/// was removed and re-locked, so the vendoring is unused (an override/ +/// resolutions DECLARATION alone does not count: pnpm's mirrored +/// `overrides:` section is excluded by the flavor probe, and the other +/// flavors carry no declaration inside the lock at all). `None`: cannot +/// determine (missing lock, unknown flavor) — callers keep the entry, +/// fail-safe. Detached entries are lockfile-invisible BY DESIGN and must +/// never be routed here (the probe would always call them unused). +pub async fn vendored_entry_in_use(entry: &VendorEntry, project_root: &Path) -> Option { + match entry.flavor.as_deref() { + Some("pnpm") => super::pnpm_lock::pnpm_entry_in_use(entry, project_root).await, + // The remaining flavors wire resolutions into the lock itself + // (resolved URLs / file: ranges / package tuples), so a textual + // probe for the uuid dir is exact: the path appears iff some + // resolution still points at the artifact. shrinkwrap wins over + // package-lock, mirroring the vendor/revert lockfile selection. + None | Some("package-lock") => { + lock_text_mentions_uuid( + project_root, + &["npm-shrinkwrap.json", "package-lock.json"], + &entry.uuid, + ) + .await + } + Some("yarn-classic") | Some("yarn-berry") => { + lock_text_mentions_uuid(project_root, &["yarn.lock"], &entry.uuid).await + } + Some("bun") => lock_text_mentions_uuid(project_root, &["bun.lock"], &entry.uuid).await, + Some(_) => None, // unknown flavor: cannot determine + } +} + +/// First readable lockfile from `names`, probed for the uuid artifact dir. 
+async fn lock_text_mentions_uuid(project_root: &Path, names: &[&str], uuid: &str) -> Option { + let needle = format!(".socket/vendor/npm/{uuid}/"); + for name in names { + if let Ok(text) = tokio::fs::read_to_string(project_root.join(name)).await { + return Some(text.contains(&needle)); + } + } + None +} + /// Revert one recorded npm vendor entry through the flavor that wired it. /// Entries from before the flavor field existed (`None`) are package-lock /// wirings; an unknown flavor fails CLOSED (an older binary must not guess @@ -773,4 +821,85 @@ mod tests { assert!(outcome.success, "flavor {flavor:?}: {:?}", outcome.error); } } + + /// One minimal entry per flavor for the in-use probe. + fn probe_entry(flavor: Option<&str>) -> VendorEntry { + VendorEntry { + ecosystem: "npm".into(), + base_purl: "pkg:npm/left-pad@1.3.0".into(), + uuid: UUID.into(), + artifact: VendorArtifact { + path: format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring: Vec::new(), + lock: None, + took_over_go_patches: false, + detached: false, + record: None, + flavor: flavor.map(str::to_string), + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } + } + + /// The textual flavors: a resolution pointing at the uuid dir means in + /// use; a clean lock means unused; a missing lock or unknown flavor + /// cannot be determined (keep, fail-safe). + #[tokio::test] + async fn vendored_entry_in_use_textual_flavors() { + let entry = probe_entry(Some("package-lock")); + + // Missing lock: undeterminable. + let tmp = tempfile::tempdir().unwrap(); + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, None); + + // Lock resolves to our artifact: in use. 
+ touch( + tmp.path(), + "package-lock.json", + &format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + + // Dep removed + re-locked (no reference left): unused. + touch(tmp.path(), "package-lock.json", "{\"packages\":{}}").await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(false)); + + // shrinkwrap wins over package-lock (same precedence as vendoring). + touch( + tmp.path(), + "npm-shrinkwrap.json", + &format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + + // yarn flavors probe yarn.lock. + let entry = probe_entry(Some("yarn-classic")); + let tmp = tempfile::tempdir().unwrap(); + touch( + tmp.path(), + "yarn.lock", + &format!("left-pad@1.3.0:\n resolved \"file:./.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz#abc\"\n"), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + touch(tmp.path(), "yarn.lock", "# yarn lockfile v1\n").await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(false)); + + // Unknown flavor: undeterminable, fail-safe keep. 
+ let entry = probe_entry(Some("future-pm")); + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, None); + } } diff --git a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs index 2be4c7a..7e23591 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs @@ -91,7 +91,7 @@ pub async fn vendor_npm( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel; let base_purl = coords.base_purl; @@ -175,6 +175,7 @@ pub async fn vendor_npm( sources, dry_run, force, + &mut warnings, ) .await { @@ -1090,6 +1091,176 @@ mod tests { assert!(found, "package/index.js missing from the tarball"); } + /// Read one member's bytes out of the packed tarball. + fn tgz_member(tgz: &[u8], member: &str) -> Option> { + let mut archive = tar::Archive::new(flate2::read::GzDecoder::new(tgz)); + for e in archive.entries().unwrap() { + let mut e = e.unwrap(); + if e.path().unwrap().to_string_lossy() == member { + let mut data = Vec::new(); + std::io::Read::read_to_end(&mut e, &mut data).unwrap(); + return Some(data); + } + } + None + } + + /// Vendor auto-force policy: installed content matching NEITHER hash + /// (e.g. a patch built against different bytes than the registry + /// artifact) is overwritten in the STAGE with the verified patched + /// content; the run succeeds, wires the lock, and surfaces the + /// overwrite as a `vendor_content_mismatch_overwritten` warning. The + /// installed tree is never touched. 
+ #[tokio::test] + async fn vendor_overwrites_mismatched_content_with_warning() { + let fx = fixture().await; + let divergent: &[u8] = b"module.exports = () => 'divergent';\n"; + tokio::fs::write(fx.installed().join("index.js"), divergent) + .await + .unwrap(); + + let (result, entry, warnings) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some(), "first vendor records a ledger entry"); + assert_eq!( + warnings + .iter() + .filter(|w| w.code == "vendor_content_mismatch_overwritten") + .count(), + 1, + "overwrite surfaced exactly once: {warnings:?}" + ); + assert!( + warnings[0].detail.contains("left-pad@1.3.0") + && warnings[0].detail.contains("package/index.js"), + "warning names the package and file: {warnings:?}" + ); + + // The tarball carries the VERIFIED patched bytes, not the divergent + // ones — every apply write path is hash-gated to afterHash. + let tgz = tokio::fs::read(fx.root().join(fx.expected_rel_tgz())) + .await + .unwrap(); + assert_eq!( + tgz_member(&tgz, "package/index.js").unwrap(), + PATCHED_INDEX + ); + + // The installed tree keeps its (divergent) bytes — only the stage + // was overwritten. + assert_eq!( + tokio::fs::read(fx.installed().join("index.js")) + .await + .unwrap(), + divergent + ); + + // The lock was rewired to the vendored artifact. + let lock = fx.read_lock().await; + assert_eq!( + lock["packages"]["node_modules/left-pad"]["resolved"], + json!(format!("file:{}", fx.expected_rel_tgz())) + ); + } + + /// Auto-force must NOT inherit force's silent NotFound skip: a missing + /// patch-target file still fails closed (a tarball without the fix + /// must never be packed), leaving the project byte-untouched. 
+ #[tokio::test] + async fn vendor_missing_patch_file_fails_without_force() { + let fx = fixture().await; + tokio::fs::remove_file(fx.installed().join("index.js")) + .await + .unwrap(); + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(!result.success, "missing file must fail closed"); + assert!( + result + .error + .as_deref() + .unwrap_or("") + .contains("File not found"), + "error names the missing file: {:?}", + result.error + ); + assert!(entry.is_none()); + assert_eq!( + tokio::fs::read(fx.lock_path()).await.unwrap(), + fx.lock_bytes, + "lock byte-untouched on failure" + ); + assert!( + tokio::fs::metadata(fx.root().join(".socket/vendor")) + .await + .is_err(), + "no artifact dir on failure" + ); + } + + /// `vendor --force` keeps its missing-file tolerance (strict superset + /// of the auto-force policy). + #[tokio::test] + async fn vendor_force_still_skips_missing_files() { + let fx = fixture().await; + tokio::fs::remove_file(fx.installed().join("index.js")) + .await + .unwrap(); + + let blobs = fx.root().join(".socket/blobs"); + let sources = PatchSources::blobs_only(&blobs); + let outcome = vendor_npm( + &fx.purl(), + &fx.installed(), + fx.root(), + &fx.record, + &sources, + "2026-06-09T00:00:00Z", + false, + /*force=*/ true, + ) + .await; + let (result, entry, _) = expect_done(outcome); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some()); + } + + /// A package already patched IN PLACE by `apply` vendors cleanly: the + /// staged copy verifies AlreadyPatched (no mismatch warning — the + /// content is exactly the patch's afterHash) and the tarball ships the + /// patched bytes. + #[tokio::test] + async fn vendor_of_already_applied_package_succeeds() { + let fx = fixture().await; + // Simulate a prior in-place `socket-patch apply`. 
+ tokio::fs::write(fx.installed().join("index.js"), PATCHED_INDEX) + .await + .unwrap(); + + let (result, entry, warnings) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some(), "first vendor records a ledger entry"); + assert!( + warnings + .iter() + .all(|w| w.code != "vendor_content_mismatch_overwritten"), + "afterHash content is AlreadyPatched, not a mismatch: {warnings:?}" + ); + + let tgz = tokio::fs::read(fx.root().join(fx.expected_rel_tgz())) + .await + .unwrap(); + assert_eq!( + tgz_member(&tgz, "package/index.js").unwrap(), + PATCHED_INDEX + ); + let lock = fx.read_lock().await; + assert_eq!( + lock["packages"]["node_modules/left-pad"]["resolved"], + json!(format!("file:{}", fx.expected_rel_tgz())) + ); + } + #[tokio::test] async fn rerun_is_in_sync_and_byte_stable() { let fx = fixture().await; @@ -1633,11 +1804,11 @@ mod tests { fn purl_and_name_helpers() { assert_eq!( parse_npm_purl("pkg:npm/left-pad@1.3.0"), - Some(("left-pad", "1.3.0")) + Some(("left-pad".into(), "1.3.0".into())) ); assert_eq!( parse_npm_purl("pkg:npm/@scope/pkg@1.0.0?foo=bar"), - Some(("@scope/pkg", "1.0.0")) + Some(("@scope/pkg".into(), "1.0.0".into())) ); assert_eq!(parse_npm_purl("pkg:npm/@scope/pkg"), None, "no version"); assert_eq!( diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index cac16e9..0174f6c 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -93,7 +93,7 @@ pub async fn vendor_pnpm( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); // pnpm spells the override target `file:` with NO // `./` (spike P1 fixtures, verbatim). 
@@ -138,10 +138,15 @@ pub async fn vendor_pnpm( let mut lines = split_lines(&lock_text); // ── 3. Pre-flight refusals (override conflicts, entry present) ─────── - if let Err(detail) = check_pkg_override_conflict(&pkg, name, &override_key) { - return refused("vendor_override_conflict", detail); - } - if let Err(detail) = check_lock_override_conflict(&lines, name, &override_key) { + // A user-authored exact-version pin equal to `version` is TAKEN OVER + // (the pin's key is rewritten to our spec on both surfaces and the + // original value recorded for revert); anything else same-name refuses. + let disposition = match classify_pkg_override(&pkg, name, version, &override_key) { + Ok(d) => d, + Err(detail) => return refused("vendor_override_conflict", detail), + }; + let effective_key = disposition.effective_key(&override_key).to_string(); + if let Err(detail) = check_lock_override(&lines, name, version, &effective_key) { return refused("vendor_override_conflict", detail); } if !lock_has_target_package(&lines, name, version) { @@ -163,6 +168,7 @@ pub async fn vendor_pnpm( sources, dry_run, force, + &mut warnings, ) .await { @@ -200,11 +206,12 @@ pub async fn vendor_pnpm( rel_tgz: &rel_tgz, spec: &spec, integrity: &packed.integrity, + override_key: &effective_key, }; let mut wiring: Vec = Vec::new(); let (pkg_changed, created_pnpm_table, created_overrides_table) = - match apply_pkg_override(&mut pkg, &override_key, &spec, &mut wiring) { + match apply_pkg_override(&mut pkg, &effective_key, &spec, &mut wiring) { Ok(out) => out, Err(e) => return done_failure(purl, e), }; @@ -307,6 +314,46 @@ pub async fn vendor_pnpm( } } +/// Is this pnpm-vendored entry still consumed by the lock's dependency +/// graph? +/// +/// `Some(true)`: a `packages:`/`snapshots:` block resolves to the entry's +/// artifact (`@file:.socket/vendor/npm//...`) — some importer +/// still depends on the package. 
`Some(false)`: the lock parses cleanly +/// and carries NO such block — the dependency was removed and re-locked +/// (the `overrides:` declaration alone does NOT count as usage: pnpm +/// keeps it mirrored from package.json even when nothing matches it). +/// `None`: cannot determine (missing/unreadable/unsupported lock) — +/// callers must keep the entry, fail-safe. +pub async fn pnpm_entry_in_use(entry: &VendorEntry, project_root: &Path) -> Option { + let text = tokio::fs::read_to_string(project_root.join(PNPM_LOCK)) + .await + .ok()?; + if check_lock_version(&text).is_err() { + return None; + } + let lines = split_lines(&text); + for section in ["packages", "snapshots"] { + let Some((start, end)) = section_bounds(&lines, section) else { + continue; + }; + let mut i = start + 1; + while let Some(block) = next_block(&lines, i, end) { + let resolved_to_ours = block + .key + .find("@file:") + .map(|at| &block.key[at + 1..]) + .and_then(parse_vendor_path) + .is_some_and(|p| p.eco == "npm" && p.uuid == entry.uuid); + if resolved_to_ours { + return Some(true); + } + i = block.end; + } + } + Some(false) +} + /// Undo one pnpm-vendored package: restore the recorded pair fragments and /// remove the artifact dir. Reverse application order; per-record ownership /// is re-checked against the live fragment (drift ⇒ warning, left alone). @@ -485,6 +532,11 @@ struct EditCtx<'a> { spec: &'a str, /// `sha512-` of the packed tarball. integrity: &'a str, + /// The override key BOTH surfaces edit (see + /// [`OverrideDisposition::effective_key`]): our canonical + /// `name@version` on a fresh insert, or the user's existing key on a + /// takeover / re-run over a taken-over key. + override_key: &'a str, } impl EditCtx<'_> { @@ -560,46 +612,129 @@ fn override_key_name(key: &str) -> &str { } } -/// Is this (key, value) override pair OURS for the target package — the -/// exact versioned selector pointing into `.socket/vendor/npm/`? 
-fn override_is_ours(key: &str, value: &str, our_key: &str) -> bool { - key == our_key && parse_vendor_path(value).is_some_and(|p| p.eco == "npm") +/// Does `value` point into `.socket/vendor/npm/` (ours — any uuid)? +fn is_vendor_value(value: &str) -> bool { + parse_vendor_path(value).is_some_and(|p| p.eco == "npm") +} + +/// How the package.json `pnpm.overrides` table relates to the package +/// being vendored. The lock's `overrides:` section must mirror this map +/// key-for-key (pnpm hard-checks the two and fails +/// `ERR_PNPM_LOCKFILE_CONFIG_MISMATCH` on any drift), so whichever key +/// this classification yields is the one BOTH surfaces edit. +#[derive(Debug, Clone, PartialEq, Eq)] +enum OverrideDisposition { + /// No same-name key: insert our canonical `name@version` key. + Insert, + /// A same-name key already points into `.socket/vendor/npm/` — ours + /// (any uuid; possibly a user key an earlier vendor took over). + /// Rewrite that key's value in place; our own value is never + /// recorded as an `original`. + Ours { key: String }, + /// A user-authored exact-version pin equal to the version being + /// vendored (`"tar-fs": "3.1.0"` or `"tar-fs@3.1.0": "3.1.0"`): take + /// the key over — rewrite its VALUE to the `file:` spec (the user's + /// pin already forces every `tar-fs` to this exact version, so + /// redirecting the same key preserves their semantics) and record + /// the pin as the wiring `original` so revert restores it exactly. + Takeover { key: String, original: String }, +} + +impl OverrideDisposition { + /// The override key both surfaces edit: the matched existing key, or + /// our canonical `name@version` on a fresh insert. + fn effective_key<'a>(&'a self, our_key: &'a str) -> &'a str { + match self { + OverrideDisposition::Insert => our_key, + OverrideDisposition::Ours { key } | OverrideDisposition::Takeover { key, .. 
} => key, + } + } } -/// A user-authored override already steering this package would be -/// silently fought over by ours; refuse instead (fail-closed). -fn check_pkg_override_conflict(pkg: &Value, name: &str, our_key: &str) -> Result<(), String> { +/// Classify the package.json override state for `name` (see +/// [`OverrideDisposition`]). `Err` is a genuine conflict (fail-closed): +/// a range/different-version value, a `parent>child` selector chain +/// (scoped to one dependent — our whole-graph rewrite has different +/// semantics), a non-string value, or several same-name keys. +fn classify_pkg_override( + pkg: &Value, + name: &str, + version: &str, + our_key: &str, +) -> Result { let Some(overrides) = pkg.get("pnpm").and_then(|p| p.get("overrides")) else { - return Ok(()); + return Ok(OverrideDisposition::Insert); }; let Some(map) = overrides.as_object() else { return Err("package.json pnpm.overrides is not an object".to_string()); }; + let mut found: Option = None; for (key, value) in map { if override_key_name(key) != name { continue; } + if found.is_some() { + return Err(format!( + "package.json carries more than one pnpm override for `{name}`; vendoring \ + cannot pick one — remove the extras first" + )); + } let value_str = value.as_str().unwrap_or(""); - if override_is_ours(key, value_str, our_key) { - continue; // ours (possibly a stale uuid) — the edit handles it + let classified = if key.contains('>') { + None + } else if is_vendor_value(value_str) { + Some(OverrideDisposition::Ours { key: key.clone() }) + } else if value_str == version && (key == name || key == our_key) { + Some(OverrideDisposition::Takeover { + key: key.clone(), + original: value_str.to_string(), + }) + } else { + None + }; + match classified { + Some(d) => found = Some(d), + None => { + return Err(format!( + "package.json already carries a pnpm override for `{key}` ({value}); \ + vendoring would fight it — remove the override (or vendor --revert) \ + first (an exact-version pin 
equal to {version} is taken over \ + automatically)" + )) + } } - return Err(format!( - "package.json already carries a pnpm override for `{key}` ({value}); vendoring \ - would fight it — remove the override (or vendor --revert) first" - )); } - Ok(()) + Ok(found.unwrap_or(OverrideDisposition::Insert)) } -/// Same conflict check against the lock's own `overrides:` section (a -/// desynced lock-side override would be silently clobbered otherwise). -fn check_lock_override_conflict(lines: &[String], name: &str, our_key: &str) -> Result<(), String> { +/// Lock-side mirror check against the effective key. Every same-name key +/// in the lock's `overrides:` section must BE `effective_key` (pnpm +/// requires the lock's override map to equal package.json's — a key-shape +/// drift means the pair is already desynced) with a value the edit can +/// own: ours, the exact pinned `version` (takeover), or already our spec. +/// A missing section/key is fine — the edit inserts it, restoring parity. +fn check_lock_override( + lines: &[String], + name: &str, + version: &str, + effective_key: &str, +) -> Result<(), String> { let Some((start, end)) = section_bounds(lines, "overrides") else { return Ok(()); }; for line in &lines[start + 1..end] { if let Some((key, _repr, rest)) = parse_key_line(line, 2) { - if override_key_name(&key) == name && !override_is_ours(&key, &rest, our_key) { + if override_key_name(&key) != name { + continue; + } + if key != effective_key { + return Err(format!( + "{PNPM_LOCK} carries an override key `{key}` for `{name}` that does not \ + match package.json's `{effective_key}` — the two override maps must \ + agree (run `pnpm install` to re-sync them) before vendoring" + )); + } + if !(is_vendor_value(&rest) || rest == version) { return Err(format!( "{PNPM_LOCK} already carries an override for `{key}` ({rest}); vendoring \ would fight it — remove the override (or vendor --revert) first" @@ -665,20 +800,24 @@ fn apply_pkg_override( if existing == Some(spec) { 
return Ok((false, false, false)); // in sync, no record } - // The conflict pre-flight guarantees any existing value here is OURS - // (a stale uuid): never record our own edit as the "original". - let was_ours = existing.is_some(); + // The classify pre-flight guarantees an existing value here is either + // OURS (a stale uuid — never recorded as an "original") or the user's + // exact-version pin being TAKEN OVER (recorded so revert restores it). + let was_present = existing.is_some(); + let original = existing + .filter(|v| !is_vendor_value(v)) + .map(|v| Value::String(v.to_string())); overrides.insert(our_key.to_string(), Value::String(spec.to_string())); wiring.push(WiringRecord { file: PACKAGE_JSON.to_string(), kind: KIND_PKG_OVERRIDE.to_string(), - action: if was_ours { + action: if was_present { WiringAction::Rewritten } else { WiringAction::Added }, key: Some(our_key.to_string()), - original: None, // Added has none; Rewritten-over-ours records none by design + original, new: Some(Value::String(spec.to_string())), }); Ok((true, created_pnpm_table, created_overrides_table)) @@ -694,7 +833,7 @@ fn edit_overrides( ctx: &EditCtx<'_>, wiring: &mut Vec, ) -> Result { - let our_key = ctx.reg_key(); + let our_key = ctx.override_key.to_string(); let entry_line = format!(" {}: {}", yaml_key(&our_key), ctx.spec); if let Some((start, end)) = section_bounds(lines, "overrides") { // Immutable scan first: our line's position (if present) + the last @@ -702,29 +841,38 @@ fn edit_overrides( let mut ours = None; let mut last_entry = start; for (i, line) in lines.iter().enumerate().take(end).skip(start + 1) { - if let Some((key, _repr, rest)) = parse_key_line(line, 2) { + if let Some((key, repr, rest)) = parse_key_line(line, 2) { last_entry = i; if key == our_key { - ours = Some((i, rest)); + ours = Some((i, repr, rest)); break; } } } - if let Some((i, rest)) = ours { + if let Some((i, repr, rest)) = ours { if rest == ctx.spec { return Ok(false); // in sync } - // Ours with 
a stale uuid (conflict pre-flight proved it). - lines[i] = entry_line; + // Ours with a stale uuid (no original), or the user's pinned + // value being TAKEN OVER (recorded as original; the live key + // repr/quoting is preserved so revert is byte-faithful). + let original = (!is_vendor_value(&rest)).then(|| rest.clone()); + lines[i] = format!(" {}: {}", yaml_key_like(&our_key, &repr), ctx.spec); wiring.push(overrides_record( &our_key, ctx.spec, WiringAction::Rewritten, + original, )); return Ok(true); } lines.insert(last_entry + 1, entry_line); - wiring.push(overrides_record(&our_key, ctx.spec, WiringAction::Added)); + wiring.push(overrides_record( + &our_key, + ctx.spec, + WiringAction::Added, + None, + )); return Ok(true); } // No overrides section: insert one right before `importers:` (with the @@ -735,17 +883,29 @@ fn edit_overrides( importers..importers, ["overrides:".to_string(), entry_line, String::new()], ); - wiring.push(overrides_record(&our_key, ctx.spec, WiringAction::Added)); + wiring.push(overrides_record( + &our_key, + ctx.spec, + WiringAction::Added, + None, + )); Ok(true) } -fn overrides_record(key: &str, spec: &str, action: WiringAction) -> WiringRecord { +fn overrides_record( + key: &str, + spec: &str, + action: WiringAction, + original: Option, +) -> WiringRecord { WiringRecord { file: PNPM_LOCK.to_string(), kind: KIND_LOCK_OVERRIDES.to_string(), action, key: Some(key.to_string()), - original: None, // Added, or rewritten-over-ours (never an original) + // `Some` only on a takeover (the user's pinned value); Added and + // rewritten-over-ours never record an original. + original: original.map(Value::String), new: Some(Value::String(spec.to_string())), } } @@ -1062,7 +1222,17 @@ fn revert_pkg_record( ))); return; } - overrides.shift_remove(key); + // A takeover recorded the user's pinned value as `original`: restore + // it in place (the key stays). A plain Added/Rewritten-over-ours + // record has no original — remove the key as before. 
+ match rec.original.as_ref().and_then(Value::as_str) { + Some(orig) => { + overrides.insert(key.to_string(), Value::String(orig.to_string())); + } + None => { + overrides.shift_remove(key); + } + } *dirty = true; } @@ -1112,15 +1282,15 @@ fn revert_overrides_line( let mut ours_at = None; let mut others = 0usize; for (i, line) in lines.iter().enumerate().take(end).skip(start + 1) { - if let Some((k, _repr, rest)) = parse_key_line(line, 2) { + if let Some((k, repr, rest)) = parse_key_line(line, 2) { if k == key && ours_at.is_none() { - ours_at = Some((i, rest)); + ours_at = Some((i, repr, rest)); } else { others += 1; } } } - let Some((idx, rest)) = ours_at else { + let Some((idx, repr, rest)) = ours_at else { warnings.push(drifted(format!("overrides entry `{key}` no longer exists"))); return; }; @@ -1132,6 +1302,13 @@ fn revert_overrides_line( ))); return; } + // A takeover recorded the user's pinned value: restore it in place + // (key + quoting preserved; the section obviously stays). + if let Some(orig) = rec.original.as_ref().and_then(Value::as_str) { + lines[idx] = format!(" {}: {orig}", yaml_key_like(key, &repr)); + *dirty = true; + return; + } lines.remove(idx); *dirty = true; if others == 0 { @@ -1404,7 +1581,7 @@ async fn commit_pair( // pnpm-lock.yaml is machine-emitted with a fixed 2/4/6/8-space shape; these // helpers splice line blocks and never interpret YAML generically. -fn split_lines(text: &str) -> Vec { +pub(super) fn split_lines(text: &str) -> Vec { text.split('\n').map(str::to_string).collect() } @@ -1415,7 +1592,7 @@ fn join_lines(lines: &[String]) -> String { /// `(header_idx, end_idx)` of a top-level `name:` section; `end` is the /// first following column-0 line (exclusive), so trailing blank separator /// lines belong to the section. 
-fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { +pub(super) fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { let header = format!("{name}:"); let start = lines.iter().position(|l| l == &header)?; let end = lines @@ -1431,10 +1608,10 @@ fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { /// One 2-space-keyed block inside a section (`[header, end)`; `end` stops at /// the blank separator / next block header, so the captured fragment is the /// verbatim entry without surrounding blanks). -struct YamlBlock { - header: usize, - end: usize, - key: String, +pub(super) struct YamlBlock { + pub(super) header: usize, + pub(super) end: usize, + pub(super) key: String, /// The key exactly as spelled in the file (incl. quotes) — rekeys /// preserve the file's quoting style. repr: String, @@ -1454,7 +1631,7 @@ impl YamlBlock { } /// The next block at or after line `i` (within `[i, end)`). -fn next_block(lines: &[String], mut i: usize, end: usize) -> Option { +pub(super) fn next_block(lines: &[String], mut i: usize, end: usize) -> Option { while i < end { if let Some((key, repr, rest)) = parse_key_line(&lines[i], 2) { let mut j = i + 1; @@ -2111,6 +2288,228 @@ snapshots: assert!(live_lock.contains("overrides:\n other-pkg: 2.0.0\n\nimporters:")); } + // ── in-use probe ─────────────────────────────────────────────────────── + + /// The prune-time in-use probe: a packages/snapshots block resolving to + /// the artifact means in use; an overrides declaration ALONE (the state + /// pnpm leaves after the dependency is removed and re-locked) does not; + /// a missing or unsupported-version lock is undeterminable (keep). + #[tokio::test] + async fn pnpm_entry_in_use_reflects_lock_graph() { + let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; + let (_, entry, _) = expect_done(fx.vendor(false).await); + let entry = entry.unwrap(); + + // Freshly vendored: the rekeyed file: blocks are in the graph. 
+ assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, Some(true)); + + // Dep removed + re-locked: pnpm prunes the file: blocks but keeps + // the overrides declaration mirrored from package.json. + let removed_lock = format!( + "lockfileVersion: '9.0'\n\nsettings:\n autoInstallPeers: true\n\ + \noverrides:\n left-pad@1.3.0: file:{}\n\nimporters:\n\n .:\n \ + dependencies:\n consumer:\n specifier: file:./consumer\n \ + version: file:consumer\n\npackages:\n\n consumer@file:consumer:\n \ + resolution: {{directory: consumer, type: directory}}\n\nsnapshots:\n\n \ + consumer@file:consumer: {{}}\n", + fx.rel_tgz() + ); + tokio::fs::write(fx.root().join(PNPM_LOCK), &removed_lock) + .await + .unwrap(); + assert_eq!( + pnpm_entry_in_use(&entry, fx.root()).await, + Some(false), + "the lingering overrides declaration alone is not usage" + ); + + // Unsupported lock version: undeterminable. + tokio::fs::write(fx.root().join(PNPM_LOCK), "lockfileVersion: '6.0'\n") + .await + .unwrap(); + assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, None); + + // Missing lock: undeterminable. + tokio::fs::remove_file(fx.root().join(PNPM_LOCK)).await.unwrap(); + assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, None); + } + + // ── exact-version pin takeover ───────────────────────────────────────── + + /// package.json with a user-authored override pin (`key: value`) plus the + /// matching lock-side `overrides:` mirror line. 
+ fn pin_fixture_inputs(key: &str, value: &str) -> (String, String) { + let pkg = format!( + "{{\n \"name\": \"vendor-spike\",\n \"version\": \"1.0.0\",\n \"private\": true,\n \"dependencies\": {{\n \"consumer\": \"file:./consumer\",\n \"left-pad\": \"1.3.0\",\n \"left-pad-old\": \"npm:left-pad@1.2.0\"\n }},\n \"pnpm\": {{\n \"overrides\": {{\n \"{key}\": \"{value}\"\n }}\n }}\n}}\n" + ); + let lock = P1_BEFORE_LOCK.replace( + "importers:", + &format!("overrides:\n {key}: {value}\n\nimporters:"), + ); + (pkg, lock) + } + + /// A user-authored EXACT-version pin equal to the patched version is + /// taken over: the user's key keeps its spelling on both surfaces, its + /// value moves to our `file:` spec, the wiring records the pin as + /// `original`, and a full revert restores both files byte-identically. + #[tokio::test] + async fn user_exact_pin_bare_key_is_taken_over_and_revert_restores_it() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + // package.json: the USER'S key (`left-pad`) now carries our spec; + // no `left-pad@1.3.0` key was added; tables pre-existed. + let pkg: Value = serde_json::from_str(&fx.read(PACKAGE_JSON).await).unwrap(); + let overrides = &pkg["pnpm"]["overrides"]; + assert_eq!( + overrides["left-pad"], + Value::String(format!("file:{}", fx.rel_tgz())) + ); + assert!(overrides.get("left-pad@1.3.0").is_none()); + assert_eq!( + entry.pnpm, + Some(PnpmMeta { + created_overrides_table: false, + created_pnpm_table: false + }) + ); + + // Lock: same key, same value (map parity — pnpm hard-checks it). 
+ let live_lock = fx.read(PNPM_LOCK).await; + assert!( + live_lock.contains(&format!("overrides:\n left-pad: file:{}", fx.rel_tgz())), + "{live_lock}" + ); + + // Wiring: both override records carry the user's key, action + // Rewritten, and the pin as `original`. + for kind in [KIND_PKG_OVERRIDE, KIND_LOCK_OVERRIDES] { + let rec = entry + .wiring + .iter() + .find(|r| r.kind == kind) + .unwrap_or_else(|| panic!("no {kind} record: {:?}", entry.wiring)); + assert_eq!(rec.key.as_deref(), Some("left-pad"), "{kind}"); + assert_eq!(rec.action, WiringAction::Rewritten, "{kind}"); + assert_eq!( + rec.original, + Some(Value::String("1.3.0".to_string())), + "{kind}: the user's pin is the original" + ); + } + + // Full revert restores the pin on both surfaces byte-identically. + let outcome = revert_pnpm(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before); + assert_eq!(fx.read(PNPM_LOCK).await, lock_before); + } + + /// The versioned key shape (`left-pad@1.3.0: 1.3.0`) is taken over the + /// same way — the key happens to equal our canonical key. + #[tokio::test] + async fn user_exact_pin_versioned_key_is_taken_over() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad@1.3.0", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + let pkg: Value = serde_json::from_str(&fx.read(PACKAGE_JSON).await).unwrap(); + assert_eq!( + pkg["pnpm"]["overrides"]["left-pad@1.3.0"], + Value::String(format!("file:{}", fx.rel_tgz())) + ); + let rec = entry + .wiring + .iter() + .find(|r| r.kind == KIND_PKG_OVERRIDE) + .unwrap(); + assert_eq!(rec.original, Some(Value::String("1.3.0".to_string()))); + + // Revert restores the pin. 
+ let outcome = revert_pnpm(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before); + assert_eq!(fx.read(PNPM_LOCK).await, lock_before); + } + + /// A second vendor over a taken-over key is the in-sync hot path: + /// AlreadyPatched, no new ledger entry, bytes stable. (Guards the + /// `Ours` classification accepting the user-keyed vendor value — the + /// old `key == our_key` requirement would refuse its own wiring.) + #[tokio::test] + async fn takeover_rerun_is_in_sync_and_records_nothing() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some()); + let pkg_after = fx.read(PACKAGE_JSON).await; + let lock_after = fx.read(PNPM_LOCK).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_none(), "in-sync rerun records nothing"); + assert!(result + .files_verified + .iter() + .all(|v| v.status == crate::patch::apply::VerifyStatus::AlreadyPatched)); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_after, "bytes stable"); + assert_eq!(fx.read(PNPM_LOCK).await, lock_after, "bytes stable"); + } + + /// Selector chains and duplicate same-name keys still refuse — only a + /// plain exact pin is taken over. (Range keys and different-version + /// values are covered by `existing_user_override_for_the_name_is_refused`.) + #[tokio::test] + async fn chain_and_duplicate_override_keys_still_refuse() { + // `parent>child` chain, even with the exact version value. 
+ let (pkg, lock) = pin_fixture_inputs("consumer>left-pad", "1.3.0"); + let fx = fixture_with(&pkg, &lock).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("consumer>left-pad"), "{detail}"); + + // Two same-name keys (one ours-shaped pin + one bare pin). + let pkg = "{\n \"name\": \"x\",\n \"pnpm\": {\n \"overrides\": {\n \"left-pad\": \"1.3.0\",\n \"left-pad@1.3.0\": \"1.3.0\"\n }\n }\n}\n".to_string(); + let fx = fixture_with(&pkg, P1_BEFORE_LOCK).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("more than one"), "{detail}"); + } + + /// pkg↔lock override-key shape drift refuses (pnpm itself would fail + /// `ERR_PNPM_LOCKFILE_CONFIG_MISMATCH`); a pkg-side pin with NO lock + /// mirror is fine — the edit inserts the same key, restoring parity. + #[tokio::test] + async fn takeover_lock_shape_mismatch_refuses_but_missing_section_inserts() { + // Shape drift: pkg keys `left-pad`, lock keys `left-pad@1.3.0`. + let (pkg, _) = pin_fixture_inputs("left-pad", "1.3.0"); + let lock = P1_BEFORE_LOCK.replace( + "importers:", + "overrides:\n left-pad@1.3.0: 1.3.0\n\nimporters:", + ); + let fx = fixture_with(&pkg, &lock).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("must"), "{detail}"); + + // No lock overrides section at all: takeover inserts the pkg key. 
+ let fx = fixture_with(&pkg, P1_BEFORE_LOCK).await; + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let live_lock = fx.read(PNPM_LOCK).await; + assert!( + live_lock.contains(&format!("overrides:\n left-pad: file:{}", fx.rel_tgz())), + "lock key matches the pkg key: {live_lock}" + ); + assert!(entry.is_some()); + } + #[tokio::test] async fn created_tables_bookkeeping_and_revert_prunes_them() { // pnpm table exists (other keys), overrides created by us: revert diff --git a/crates/socket-patch-core/src/patch/vendor/pypi.rs b/crates/socket-patch-core/src/patch/vendor/pypi.rs index 553a0dc..8b317f3 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi.rs @@ -405,6 +405,7 @@ pub async fn vendor_pypi( &dest, dry_run, force, + &mut warnings, ) .await; let (result, artifact) = match built { diff --git a/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs b/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs index 69d4ffb..4b851ff 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs @@ -20,7 +20,7 @@ use sha2::Digest as _; use crate::crawlers::python_crawler::{canonicalize_pypi_name, read_python_metadata}; use crate::manifest::schema::PatchRecord; use crate::patch::apply::{ - apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, }; use crate::utils::fs::{atomic_write_bytes, list_dir_entries}; @@ -255,6 +255,7 @@ pub async fn build_patched_wheel( dest: &Path, dry_run: bool, force: bool, + warnings: &mut Vec, ) -> Result<(ApplyResult, Option), (&'static str, String)> { // Editable installs (`pip install -e` / uv tool dev mode) point // site-packages at the user's own working tree: the RECORD describes a @@ -371,15 +372,18 @@ pub async fn 
build_patched_wheel( } // Patch the stage through the shared apply pipeline (same verify/source - // strategy contract as `apply`). The installed tree is never touched. - let mut result = apply_package_patch( + // strategy contract as `apply`, with the vendor auto-force policy — + // see `force_apply_staged`). The installed tree is never touched. + let mut result = super::force_apply_staged( purl, stage.path(), - &record.files, + record, sources, - Some(&record.uuid), dry_run, force, + &dist.dist_name, + &dist.version, + warnings, ) .await; if dry_run || !result.success { @@ -903,6 +907,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -952,6 +957,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap_err(); @@ -977,6 +983,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -994,6 +1001,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1044,6 +1052,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1082,6 +1091,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap_err(); @@ -1105,6 +1115,7 @@ mod tests { &fx.dest, true, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1120,8 +1131,12 @@ mod tests { ); } + /// Vendor auto-force policy: installed content matching NEITHER hash is + /// overwritten with the verified patched content in the STAGE (the + /// installed tree is never touched), and the overwrite is surfaced as a + /// `vendor_content_mismatch_overwritten` warning. #[tokio::test] - async fn hash_mismatch_fails_without_touching_install_or_dest() { + async fn hash_mismatch_overwrites_in_stage_with_warning() { let fx = make_fixture("", None).await; // Corrupt the installed six.py so verify sees a HashMismatch. 
tokio::fs::write(fx.site_packages.join("six.py"), b"tampered") @@ -1132,6 +1147,7 @@ mod tests { .unwrap(); let record = patch_record(&[("six.py", ORIG, PATCHED)]); let sources = PatchSources::blobs_only(&fx.blobs); + let mut warnings = Vec::new(); let (result, artifact) = build_patched_wheel( "pkg:pypi/six@1.16.0", &fx.site_packages, @@ -1141,10 +1157,65 @@ mod tests { &fx.dest, false, false, + &mut warnings, + ) + .await + .unwrap(); + assert!(result.success, "{:?}", result.error); + assert!(artifact.is_some()); + assert!(fx.dest.exists(), "patched wheel must be written"); + assert_eq!( + warnings + .iter() + .filter(|w| w.code == "vendor_content_mismatch_overwritten") + .count(), + 1, + "overwrite surfaced as a warning: {warnings:?}" + ); + // Installed tree untouched — only the stage was overwritten. + assert_eq!( + tokio::fs::read(fx.site_packages.join("six.py")) + .await + .unwrap(), + b"tampered" + ); + } + + /// A patch-target file MISSING from the install still fails closed + /// without `--force` — auto-force must not inherit force's silent + /// NotFound skip (the wheel would ship without the fix). + #[tokio::test] + async fn missing_patch_file_fails_without_force() { + let fx = make_fixture("", None).await; + tokio::fs::remove_file(fx.site_packages.join("six.py")) + .await + .unwrap(); + let dist = locate_installed_dist(&fx.site_packages, "six", "1.16.0") + .await + .unwrap(); + let record = patch_record(&[("six.py", ORIG, PATCHED)]); + let sources = PatchSources::blobs_only(&fx.blobs); + let (result, artifact) = build_patched_wheel( + "pkg:pypi/six@1.16.0", + &fx.site_packages, + &dist, + &record, + &sources, + &fx.dest, + false, + false, + &mut Vec::new(), ) .await .unwrap(); assert!(!result.success); + // The RECORD staging step trips first ("RECORD member ... is + // unreadable") — either way the build fails closed rather than + // packing a wheel without the fix. 
+ assert!( + result.error.is_some(), + "missing file fails closed with an error" + ); assert!(artifact.is_none()); assert!(!fx.dest.exists()); } diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs new file mode 100644 index 0000000..3cef46b --- /dev/null +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -0,0 +1,1435 @@ +//! Pristine-artifact fetching for lockfile-resolved packages with no +//! installed copy. +//! +//! `vendor` needs an installed package dir to stage from; on a fresh clone +//! there is none. This module downloads the pristine artifact the lockfile +//! resolves (the lock-recorded URL when present, the conventional registry +//! URL otherwise), verifies it against the integrity the lock records +//! **FAIL-CLOSED and before anything is written to the staging dir**, and +//! extracts it into a private tempdir the vendor pipeline then treats as +//! the installed dir. The project tree — node_modules included — is never +//! touched. +//! +//! Trust model: the URL comes from the user's own committed lockfile (or a +//! conventional construction from it); content trust comes from the +//! lock-recorded hash, not the transport — which is also why an entry with +//! no verifier ([`LockIntegrity::None`]) is refused outright +//! ([`FetchError::Unverifiable`]) without any network I/O. + +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use base64::Engine as _; +use sha1::Sha1; +use sha2::{Digest, Sha256, Sha384, Sha512}; + +use crate::constants::USER_AGENT; +use crate::patch::apply::is_safe_relative_subpath; + +use super::lock_inventory::{LockIntegrity, LockfileEntry}; + +/// The default npm registry; override with `SOCKET_NPM_REGISTRY` (the +/// enterprise-mirror / test escape hatch — `.npmrc` parsing is out of +/// scope, but lock-recorded `resolved` URLs already carry custom hosts). 
+pub const DEFAULT_NPM_REGISTRY: &str = "https://registry.npmjs.org"; + +/// Whole-package caps — wider than `patch/package.rs`'s patch-archive caps +/// because these are full upstream packages, but still bounded so a +/// poisoned lockfile cannot turn the fetch into a disk/memory bomb. +const MAX_DOWNLOAD_BYTES: u64 = 128 * 1024 * 1024; +const MAX_TOTAL_DECOMPRESSED_BYTES: u64 = 512 * 1024 * 1024; +const MAX_ENTRY_BYTES: u64 = 128 * 1024 * 1024; +const MAX_ENTRIES: usize = 60_000; + +/// A fetched, verified, extracted package. The tempdir lives exactly as +/// long as this value — callers must hold it until the vendor pipeline has +/// finished staging from [`FetchedPackage::dir`]. +#[derive(Debug)] +pub struct FetchedPackage { + dir: PathBuf, + /// Where the bytes came from (surfaced in the fetch warning event). + pub url: String, + _tmp: tempfile::TempDir, +} + +impl FetchedPackage { + /// The extracted package root (`package.json` at the top for npm). + pub fn dir(&self) -> &Path { + &self.dir + } +} + +#[derive(Debug)] +pub enum FetchError { + /// The entry cannot be verified against the lockfile (no integrity + /// recorded, or no fetcher for its ecosystem) — decided BEFORE any + /// network I/O; the caller keeps its `package_not_installed` outcome. + Unverifiable(String), + /// The fetch was attempted and failed (HTTP error, size cap, integrity + /// mismatch, extraction failure). User-facing message. + Failed(String), +} + +/// One shared client for all fetches in a run. +pub fn build_registry_client() -> reqwest::Client { + reqwest::Client::builder() + .user_agent(USER_AGENT) + .timeout(Duration::from_secs(60)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()) +} + +/// The npm registry base after the env override. 
+pub fn npm_registry_base() -> String { + std::env::var("SOCKET_NPM_REGISTRY") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_NPM_REGISTRY.to_string()) +} + +/// Conventional npm tarball URL: the scope stays in the package path, the +/// tarball leaf uses the bare name — +/// `{base}/@scope/name/-/name-1.0.0.tgz` / `{base}/name/-/name-1.0.0.tgz`. +pub fn npm_tarball_url(base: &str, name: &str, version: &str) -> String { + let leaf = name.rsplit('/').next().unwrap_or(name); + format!("{base}/{name}/-/{leaf}-{version}.tgz") +} + +/// Fetch + verify + extract one lockfile entry. Ecosystems without a +/// fetcher yet return [`FetchError::Unverifiable`] (callers keep their +/// not-installed outcome). +pub async fn fetch_and_stage( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + if entry.integrity == LockIntegrity::None { + return Err(FetchError::Unverifiable(format!( + "the lockfile records no integrity hash for {}@{}; refusing to fetch \ + unverifiable content", + entry.name, entry.version + ))); + } + match entry.ecosystem { + "npm" => fetch_npm(entry, client).await, + #[cfg(feature = "cargo")] + "cargo" => fetch_cargo(entry, client).await, + #[cfg(feature = "golang")] + "golang" => fetch_golang(entry, client).await, + #[cfg(feature = "composer")] + "composer" => fetch_composer(entry, client).await, + "gem" => fetch_gem(entry, client).await, + "pypi" => fetch_pypi(entry, client).await, + other => Err(FetchError::Unverifiable(format!( + "no registry fetcher for ecosystem `{other}`" + ))), + } +} + +/// Traversal-guarded zip extraction. `strip_first` mirrors the tar +/// behavior (composer dist zips carry a variable top dir; wheels carry +/// content at the root). 
+fn extract_zip(bytes: &[u8], dest: &Path, strip_first: bool) -> Result<(), String> { + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable zip: {e}"))?; + if archive.len() > MAX_ENTRIES { + return Err(format!("zip exceeds {MAX_ENTRIES} entries")); + } + let mut total: u64 = 0; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable zip entry: {e}"))?; + if file.is_dir() { + continue; + } + let raw = PathBuf::from(file.name()); + let rel = if strip_first { + match strip_first_component(&raw) { + Some(rel) => rel, + None => continue, + } + } else { + raw.clone() + }; + let rel_str = rel.to_string_lossy().into_owned(); + if !is_safe_relative_subpath(&rel_str) { + return Err(format!( + "zip entry `{}` escapes the extraction dir — refusing the artifact", + raw.display() + )); + } + if file.size() > MAX_ENTRY_BYTES { + return Err(format!( + "zip entry `{rel_str}` is {} bytes (cap {MAX_ENTRY_BYTES})", + file.size() + )); + } + total += file.size(); + if total > MAX_TOTAL_DECOMPRESSED_BYTES { + return Err(format!( + "zip decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap" + )); + } + let target = dest.join(&rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut file, &mut out) + .map_err(|e| format!("cannot extract `{rel_str}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let exec = file.unix_mode().is_some_and(|m| m & 0o111 != 0); + let perms = if exec { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + Ok(()) +} + +/// Composer dist zips (packagist/GitHub zipballs): sha1-verified, variable +/// top dir stripped. 
The extracted dir plays the installed package dir. +#[cfg(feature = "composer")] +async fn fetch_composer( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "composer.lock records no dist URL for {}@{}", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_zip(&bytes, &dir, /*strip_first=*/ true).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("composer.json")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched dist for {}@{} carries no composer.json", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// `.gem` files are plain tar containers holding `data.tar.gz` (the +/// package content, no prefix dir) + metadata. The whole `.gem` is +/// sha256-verified against the Gemfile.lock CHECKSUMS entry first. +async fn fetch_gem( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "no download URL for {}@{}", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + + // Locate data.tar.gz inside the (uncompressed) outer tar. + let mut archive = tar::Archive::new(bytes.as_slice()); + let mut data: Option> = None; + for e in archive + .entries() + .map_err(|e| FetchError::Failed(format!("unreadable .gem: {e}")))? 
+ { + use std::io::Read as _; + let mut e = e.map_err(|err| FetchError::Failed(format!("unreadable .gem entry: {err}")))?; + let is_data = e + .path() + .ok() + .is_some_and(|p| p.as_os_str() == "data.tar.gz"); + if !is_data { + continue; + } + if e.header().size().unwrap_or(u64::MAX) > MAX_DOWNLOAD_BYTES { + return Err(FetchError::Failed("data.tar.gz exceeds the size cap".into())); + } + let mut buf = Vec::new(); + e.read_to_end(&mut buf) + .map_err(|err| FetchError::Failed(format!("cannot read data.tar.gz: {err}")))?; + data = Some(buf); + break; + } + let Some(data) = data else { + return Err(FetchError::Failed(format!( + "fetched .gem for {}@{} carries no data.tar.gz", + entry.name, entry.version + ))); + }; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("gem"); + extract_tgz_no_strip(&data, &dir).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Pure-python wheels recorded by uv.lock (URL + sha256): the unzipped +/// wheel IS a site-packages layout (package dirs + `.dist-info/RECORD` at +/// the root), which is exactly the shape the pypi vendor backend stages +/// from. 
+async fn fetch_pypi( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "the lockfile records no platform-independent wheel URL for {}@{} (only uv.lock carries fetchable wheel resolutions today)", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("site-packages"); + extract_zip(&bytes, &dir, /*strip_first=*/ false).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// crates.io static download host; override with `SOCKET_CRATES_REGISTRY`. +#[cfg(feature = "cargo")] +pub const DEFAULT_CRATES_REGISTRY: &str = "https://static.crates.io/crates"; + +#[cfg(feature = "cargo")] +fn crates_registry_base() -> String { + std::env::var("SOCKET_CRATES_REGISTRY") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_CRATES_REGISTRY.to_string()) +} + +/// `.crate` files are tar.gz with a `{name}-{version}/` top dir — the same +/// extraction path as npm tarballs. The Cargo.lock `checksum` is the sha256 +/// of the `.crate` bytes. 
+#[cfg(feature = "cargo")] +async fn fetch_cargo( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let url = entry.resolved.clone().unwrap_or_else(|| { + format!( + "{}/{}/{}-{}.crate", + crates_registry_base(), + entry.name, + entry.name, + entry.version + ) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("crate"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("Cargo.toml")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched .crate for {}@{} carries no Cargo.toml — not a crate", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Default Go module proxy; `SOCKET_GOPROXY` wins, else the standard +/// `GOPROXY` env (first element that isn't `direct`/`off`). +#[cfg(feature = "golang")] +pub const DEFAULT_GOPROXY: &str = "https://proxy.golang.org"; + +#[cfg(feature = "golang")] +fn goproxy_base() -> String { + if let Ok(v) = std::env::var("SOCKET_GOPROXY") { + let v = v.trim_end_matches('/').to_string(); + if !v.is_empty() { + return v; + } + } + if let Ok(v) = std::env::var("GOPROXY") { + for part in v.split(',') { + let part = part.trim().trim_end_matches('/'); + if !part.is_empty() && part != "direct" && part != "off" { + return part.to_string(); + } + } + } + DEFAULT_GOPROXY.to_string() +} + +/// Go's module-path case encoding for proxy URLs: an uppercase letter `X` +/// becomes `!x` (applies to the module path and the version). 
+#[cfg(feature = "golang")] +fn go_escape(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + if c.is_ascii_uppercase() { + out.push('!'); + out.push(c.to_ascii_lowercase()); + } else { + out.push(c); + } + } + out +} + +/// go.sum's `h1:` dirhash over a module zip: sha256 of the sorted +/// `"{sha256hex(content)} {entry name}\n"` lines, base64-encoded +/// (golang.org/x/mod/sumdb/dirhash Hash1/HashZip). Computed in memory +/// BEFORE extraction. +#[cfg(feature = "golang")] +fn go_h1_of_zip(bytes: &[u8]) -> Result { + use std::io::Read as _; + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable module zip: {e}"))?; + if archive.len() > MAX_ENTRIES { + return Err(format!("module zip exceeds {MAX_ENTRIES} entries")); + } + let mut files: Vec<(String, String)> = Vec::new(); + let mut total: u64 = 0; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable module zip entry: {e}"))?; + if file.is_dir() { + continue; // go module zips carry files only + } + let name = file.name().to_string(); + if name.contains('\n') { + return Err("module zip entry name contains a newline".to_string()); + } + if file.size() > MAX_ENTRY_BYTES { + return Err(format!( + "module zip entry `{name}` is {} bytes (cap {MAX_ENTRY_BYTES})", + file.size() + )); + } + total += file.size(); + if total > MAX_TOTAL_DECOMPRESSED_BYTES { + return Err(format!( + "module zip decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap" + )); + } + let mut hasher = Sha256::new(); + let mut buf = [0u8; 64 * 1024]; + loop { + let n = file + .read(&mut buf) + .map_err(|e| format!("cannot read module zip entry `{name}`: {e}"))?; + if n == 0 { + break; + } + hasher.update(&buf[..n]); + } + files.push((name, hex::encode(hasher.finalize()))); + } + files.sort_by(|a, b| a.0.cmp(&b.0)); + let mut h = Sha256::new(); + for (name, content_hex) in &files { + 
h.update(format!("{content_hex} {name}\n").as_bytes()); + } + Ok(format!( + "h1:{}", + base64::engine::general_purpose::STANDARD.encode(h.finalize()) + )) +} + +/// Traversal-guarded zip extraction with an EXPLICIT required prefix +/// (`@/` — go module paths contain slashes, so a +/// first-component strip would be wrong). Same guard family as +/// [`extract_tgz`]; an entry outside the prefix fails the whole artifact. +#[cfg(feature = "golang")] +fn extract_zip_with_prefix(bytes: &[u8], dest: &Path, prefix: &str) -> Result<(), String> { + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable module zip: {e}"))?; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable module zip entry: {e}"))?; + if file.is_dir() { + continue; + } + let name = file.name().to_string(); + let Some(rel) = name.strip_prefix(prefix) else { + return Err(format!( + "module zip entry `{name}` lies outside `{prefix}` — refusing the artifact" + )); + }; + if !is_safe_relative_subpath(rel) { + return Err(format!( + "module zip entry `{name}` escapes the extraction dir — refusing the artifact" + )); + } + let target = dest.join(rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut file, &mut out).map_err(|e| format!("cannot extract `{rel}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let exec = file.unix_mode().is_some_and(|m| m & 0o111 != 0); + let perms = if exec { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + Ok(()) +} + +#[cfg(feature = "golang")] +async fn fetch_golang( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let 
LockIntegrity::GoH1(expected) = &entry.integrity else { + return Err(FetchError::Unverifiable( + "go module entries verify via the go.sum h1 dirhash only".to_string(), + )); + }; + let url = entry.resolved.clone().unwrap_or_else(|| { + format!( + "{}/{}/@v/{}.zip", + goproxy_base(), + go_escape(&entry.name), + go_escape(&entry.version) + ) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + let actual = go_h1_of_zip(&bytes).map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "go.sum dirhash mismatch: lockfile records {expected}, the fetched module zip \ + hashes to {actual}" + ))); + } + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("module"); + let prefix = format!("{}@{}/", entry.name, entry.version); + extract_zip_with_prefix(&bytes, &dir, &prefix).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +async fn fetch_npm( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let url = entry.resolved.clone().unwrap_or_else(|| { + npm_tarball_url(&npm_registry_base(), &entry.name, &entry.version) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + match &entry.integrity { + // yarn berry locks never hash the tarball itself — the checksum is + // sha512 of the deterministic cache zip. Rebuild it from the fetched + // bytes (the same spike-pinned recipe the berry wiring uses) and + // compare. Only cacheKey 10c0 (yarn 4 default) is reproducible. 
+ LockIntegrity::BerryChecksum(expected) => { + if !expected.starts_with("10c0/") { + return Err(FetchError::Unverifiable(format!( + "yarn berry checksum `{expected}` uses a cacheKey other than 10c0; the \ + cache-zip recipe is not reproducible for it" + ))); + } + let actual = super::berry_zip::berry_cache_checksum_10c0(&bytes, &entry.name) + .map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "yarn berry cache checksum mismatch: lockfile records {expected}, the \ + fetched tarball rebuilds to {actual}" + ))); + } + } + other => verify_integrity(&bytes, other)?, + } + + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("package.json")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched tarball for {}@{} carries no package.json — not an npm package", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Stage a package from an on-disk vendored tarball (the fresh-clone +/// re-vendor path: the project has our committed artifact but no installed +/// copy). The bytes are verified against the LEDGER-recorded sha256 before +/// extraction — same fail-closed posture as the registry path; an entry +/// with no recorded hash is refused. 
+pub async fn stage_local_artifact( + tgz_path: &Path, + expected_sha256_hex: &str, +) -> Result { + if expected_sha256_hex.is_empty() { + return Err(FetchError::Unverifiable( + "the vendor ledger records no sha256 for the artifact".to_string(), + )); + } + let bytes = tokio::fs::read(tgz_path) + .await + .map_err(|e| FetchError::Failed(format!("cannot read {}: {e}", tgz_path.display())))?; + if bytes.len() as u64 > MAX_DOWNLOAD_BYTES { + return Err(FetchError::Failed(format!( + "{}: artifact exceeds the {MAX_DOWNLOAD_BYTES}-byte cap", + tgz_path.display() + ))); + } + let actual = hex::encode(Sha256::digest(&bytes)); + if !actual.eq_ignore_ascii_case(expected_sha256_hex) { + return Err(FetchError::Failed(format!( + "{}: sha256 mismatch against the vendor ledger (recorded {expected_sha256_hex}, \ + on-disk bytes hash to {actual})", + tgz_path.display() + ))); + } + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create staging tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url: format!("file:{}", tgz_path.display()), + _tmp: tmp, + }) +} + +/// Capped download. http(s) only; the cap is enforced on the declared +/// Content-Length AND the actual stream (a lying server cannot blow past +/// it). 
+async fn download(client: &reqwest::Client, url: &str) -> Result, String> { + if !(url.starts_with("https://") || url.starts_with("http://")) { + return Err(format!("refusing non-http(s) artifact URL `{url}`")); + } + let mut resp = client + .get(url) + .send() + .await + .map_err(|e| format!("GET {url}: {e}"))?; + let status = resp.status(); + if !status.is_success() { + return Err(format!("GET {url}: HTTP {status}")); + } + if let Some(len) = resp.content_length() { + if len > MAX_DOWNLOAD_BYTES { + return Err(format!( + "{url}: artifact is {len} bytes (cap {MAX_DOWNLOAD_BYTES})" + )); + } + } + let mut bytes: Vec = Vec::new(); + while let Some(chunk) = resp + .chunk() + .await + .map_err(|e| format!("reading {url}: {e}"))? + { + if bytes.len() as u64 + chunk.len() as u64 > MAX_DOWNLOAD_BYTES { + return Err(format!( + "{url}: artifact exceeds the {MAX_DOWNLOAD_BYTES}-byte cap" + )); + } + bytes.extend_from_slice(&chunk); + } + Ok(bytes) +} + +/// Verify downloaded bytes against the lock-recorded verifier. Runs BEFORE +/// any disk write. Berry cache-zip checksums and go.sum dirhashes have +/// dedicated verifiers in their ecosystems' fetchers. 
+fn verify_integrity(bytes: &[u8], integrity: &LockIntegrity) -> Result<(), FetchError> { + match integrity { + LockIntegrity::Sri(sri) => verify_sri(bytes, sri).map_err(FetchError::Failed), + LockIntegrity::Sha1Hex(expect) => { + let actual = hex::encode(Sha1::digest(bytes)); + if &actual == expect { + Ok(()) + } else { + Err(FetchError::Failed(format!( + "sha1 mismatch: lockfile records {expect}, downloaded bytes hash to {actual}" + ))) + } + } + LockIntegrity::Sha256Hex(expect) => { + let actual = hex::encode(Sha256::digest(bytes)); + if actual.eq_ignore_ascii_case(expect) { + Ok(()) + } else { + Err(FetchError::Failed(format!( + "sha256 mismatch: lockfile records {expect}, downloaded bytes hash to {actual}" + ))) + } + } + LockIntegrity::BerryChecksum(_) | LockIntegrity::GoH1(_) => { + Err(FetchError::Unverifiable( + "verifier handled by a dedicated ecosystem fetcher".to_string(), + )) + } + LockIntegrity::None => Err(FetchError::Unverifiable( + "no integrity recorded".to_string(), + )), + } +} + +/// SRI verification: pick the strongest hash of a (possibly multi-hash, +/// whitespace-separated) SRI string and compare base64 digests. 
+fn verify_sri(bytes: &[u8], sri: &str) -> Result<(), String> { + let mut best: Option<(u8, &str, &str)> = None; + for token in sri.split_whitespace() { + let Some((algo, b64)) = token.split_once('-') else { + continue; + }; + let rank = match algo { + "sha512" => 3, + "sha384" => 2, + "sha256" => 1, + _ => continue, + }; + if best.map(|(r, _, _)| rank > r).unwrap_or(true) { + best = Some((rank, algo, b64)); + } + } + let Some((_, algo, expect)) = best else { + return Err(format!("no usable hash in SRI `{sri}`")); + }; + let b64 = base64::engine::general_purpose::STANDARD; + let actual = match algo { + "sha512" => b64.encode(Sha512::digest(bytes)), + "sha384" => b64.encode(Sha384::digest(bytes)), + _ => b64.encode(Sha256::digest(bytes)), + }; + if actual == expect { + Ok(()) + } else { + Err(format!( + "{algo} integrity mismatch: lockfile records {expect}, downloaded bytes hash to \ + {actual}" + )) + } +} + +/// Strip the FIRST path component (npm's tarball semantics — usually +/// `package/`, but registry tarballs may use any prefix dir). +fn strip_first_component(path: &Path) -> Option { + let mut components = path.components(); + components.next()?; + let rest = components.as_path(); + (!rest.as_os_str().is_empty()).then(|| rest.to_path_buf()) +} + +/// Traversal-guarded, mode-preserving tgz extraction (the same guard +/// family as `patch/package.rs::read_archive_to_map`, plus exec-bit +/// preservation: the deterministic re-pack reads modes from disk, so a +/// bytes-only extraction would silently strip bin scripts' exec bits). +/// Fails CLOSED on any traversal-shaped entry — a malicious tarball must +/// not half-extract. +fn extract_tgz(bytes: &[u8], dest: &Path) -> Result<(), String> { + extract_tar_gz(bytes, dest, /*strip_first=*/ true) +} + +/// Like [`extract_tgz`] but keeps entry paths verbatim (gem `data.tar.gz` +/// archives carry package content at the root, no prefix dir). 
+#[allow(dead_code)] // used by the gem fetcher (feature-independent helper) +fn extract_tgz_no_strip(bytes: &[u8], dest: &Path) -> Result<(), String> { + extract_tar_gz(bytes, dest, /*strip_first=*/ false) +} + +fn extract_tar_gz(bytes: &[u8], dest: &Path, strip_first: bool) -> Result<(), String> { + use std::io::Read as _; + let gz = flate2::read::GzDecoder::new(bytes).take(MAX_TOTAL_DECOMPRESSED_BYTES); + let mut archive = tar::Archive::new(gz); + let mut count = 0usize; + for entry in archive + .entries() + .map_err(|e| format!("unreadable tarball: {e}"))? + { + let mut entry = entry.map_err(|e| format!("unreadable tarball entry: {e}"))?; + count += 1; + if count > MAX_ENTRIES { + return Err(format!("tarball exceeds {MAX_ENTRIES} entries")); + } + // Regular files only: symlinks/hardlinks/devices never extract + // (a symlink could redirect later entries out of the stage). + if !entry.header().entry_type().is_file() { + continue; + } + let raw = entry + .path() + .map_err(|e| format!("tarball entry has an undecodable path: {e}"))? 
+ .into_owned(); + let rel = if strip_first { + match strip_first_component(&raw) { + Some(rel) => rel, + None => continue, // a bare prefix-level file — not package content + } + } else { + raw.clone() + }; + let rel_str = rel.to_string_lossy(); + if !is_safe_relative_subpath(&rel_str) { + return Err(format!( + "tarball entry `{}` escapes the extraction dir — refusing the artifact", + raw.display() + )); + } + let size = entry.header().size().unwrap_or(u64::MAX); + if size > MAX_ENTRY_BYTES { + return Err(format!( + "tarball entry `{rel_str}` is {size} bytes (cap {MAX_ENTRY_BYTES})" + )); + } + let target = dest.join(&rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut entry, &mut out) + .map_err(|e| format!("cannot extract `{rel_str}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mode = entry.header().mode().unwrap_or(0o644); + let perms = if mode & 0o111 != 0 { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use wiremock::matchers::{method, path as url_path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + /// Build a gzipped tarball with the given `(path, bytes, exec)` entries. 
+ fn make_tgz(entries: &[(&str, &[u8], bool)]) -> Vec { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes, exec) in entries { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(if *exec { 0o755 } else { 0o644 }); + header.set_cksum(); + builder.append_data(&mut header, path, *bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() + } + + fn sri_of(bytes: &[u8]) -> String { + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) + } + + fn npm_entry(resolved: Option, integrity: LockIntegrity) -> LockfileEntry { + LockfileEntry { + ecosystem: "npm", + name: "left-pad".into(), + version: "1.3.0".into(), + purl: "pkg:npm/left-pad@1.3.0".into(), + resolved, + integrity, + } + } + + #[test] + fn tarball_url_forms() { + assert_eq!( + npm_tarball_url(DEFAULT_NPM_REGISTRY, "left-pad", "1.3.0"), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz" + ); + assert_eq!( + npm_tarball_url(DEFAULT_NPM_REGISTRY, "@scope/pkg", "2.0.0"), + "https://registry.npmjs.org/@scope/pkg/-/pkg-2.0.0.tgz", + "the scope stays in the path; the leaf uses the bare name" + ); + } + + #[test] + fn sri_picks_strongest_hash_and_compares() { + let bytes = b"hello"; + let good = sri_of(bytes); + assert!(verify_sri(bytes, &good).is_ok()); + // Multi-hash: a wrong sha256 alongside the right sha512 still passes + // (strongest wins), and vice versa fails. 
+ let multi = format!("sha256-WRONG= {good}"); + assert!(verify_sri(bytes, &multi).is_ok()); + let bad = sri_of(b"other"); + assert!(verify_sri(bytes, &bad).is_err()); + assert!(verify_sri(bytes, "md5-abc=").is_err(), "unknown algos refuse"); + } + + #[tokio::test] + async fn fetch_verifies_sri_and_extracts_with_modes() { + let tgz = make_tgz(&[ + ("package/package.json", br#"{"name":"left-pad"}"#, false), + ("package/bin/cli.js", b"#!/usr/bin/env node\n", true), + ("package/index.js", b"module.exports = 1;\n", false), + ]); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz.clone())) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(&tgz)), + ); + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("package.json").is_file()); + assert_eq!( + std::fs::read(fetched.dir().join("index.js")).unwrap(), + b"module.exports = 1;\n" + ); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mode = std::fs::metadata(fetched.dir().join("bin/cli.js")) + .unwrap() + .permissions() + .mode(); + assert_eq!(mode & 0o111, 0o111, "exec bit preserved"); + } + // The tempdir dies with the holder. 
+ let dir = fetched.dir().to_path_buf(); + drop(fetched); + assert!(!dir.exists()); + } + + #[tokio::test] + async fn integrity_mismatch_fails_before_extraction() { + let tgz = make_tgz(&[("package/package.json", b"{}", false)]); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(b"the lock expects different bytes")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => { + assert!(msg.contains("mismatch"), "{msg}") + } + other => panic!("expected integrity failure, got {other:?}"), + } + } + + #[tokio::test] + async fn unverifiable_entry_refuses_without_network() { + // A URL that would hard-fail if contacted — Unverifiable proves the + // decision happened before any I/O. + let entry = npm_entry( + Some("http://127.0.0.1:1/nope.tgz".into()), + LockIntegrity::None, + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => { + assert!(msg.contains("no integrity"), "{msg}") + } + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[tokio::test] + async fn http_error_and_scheme_guard_fail_closed() { + let mock = MockServer::start().await; + // No mounted route → 404. 
+ let entry = npm_entry( + Some(format!("{}/missing.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(b"x")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("404"), "{msg}"), + other => panic!("expected HTTP failure, got {other:?}"), + } + + let entry = npm_entry( + Some("ftp://example.com/x.tgz".into()), + LockIntegrity::Sri(sri_of(b"x")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("non-http"), "{msg}"), + other => panic!("expected scheme refusal, got {other:?}"), + } + } + + #[test] + fn extraction_strips_first_component_whatever_its_name() { + let tgz = make_tgz(&[("weird-prefix/package.json", b"{}", false)]); + let tmp = tempfile::tempdir().unwrap(); + extract_tgz(&tgz, tmp.path()).unwrap(); + assert!(tmp.path().join("package.json").is_file()); + } + + #[test] + fn traversal_entries_fail_closed() { + // The tar crate refuses to WRITE `..` paths, so craft the header + // name bytes directly — exactly what a hostile tarball would carry. 
+ for evil in ["package/../../escape.js", "package/x/../../../up.js"] { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + { + let name = &mut header.as_gnu_mut().unwrap().name; + name[..evil.len()].copy_from_slice(evil.as_bytes()); + } + header.set_size(4); + header.set_mode(0o644); + header.set_cksum(); + builder.append(&header, &b"evil"[..]).unwrap(); + let tgz = builder.into_inner().unwrap().finish().unwrap(); + + let tmp = tempfile::tempdir().unwrap(); + let err = extract_tgz(&tgz, tmp.path()).unwrap_err(); + assert!(err.contains("escapes"), "{evil}: {err}"); + assert!( + std::fs::read_dir(tmp.path()).unwrap().next().is_none(), + "nothing may extract from a traversal-bearing tarball" + ); + } + } + + #[tokio::test] + async fn berry_checksum_verifies_via_cache_zip_rebuild() { + let tgz = make_tgz(&[ + ("package/package.json", br#"{"name":"left-pad"}"#, false), + ("package/index.js", b"module.exports = 1;\n", false), + ]); + let expected = + super::super::berry_zip::berry_cache_checksum_10c0(&tgz, "left-pad").unwrap(); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(expected), + ); + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("package.json").is_file()); + + // Tampered checksum → Failed; foreign cacheKey → Unverifiable. 
+ let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(format!("10c0/{}", "0".repeat(128))), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(format!("9/{}", "0".repeat(128))), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => assert!(msg.contains("cacheKey"), "{msg}"), + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[tokio::test] + async fn stage_local_artifact_verifies_ledger_sha256() { + let tgz = make_tgz(&[("package/package.json", b"{}", false)]); + let tmp = tempfile::tempdir().unwrap(); + let tgz_path = tmp.path().join("left-pad-1.3.0.tgz"); + std::fs::write(&tgz_path, &tgz).unwrap(); + let sha = hex::encode(Sha256::digest(&tgz)); + + let staged = stage_local_artifact(&tgz_path, &sha).await.unwrap(); + assert!(staged.dir().join("package.json").is_file()); + + match stage_local_artifact(&tgz_path, &"0".repeat(64)).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected ledger mismatch, got {other:?}"), + } + match stage_local_artifact(&tgz_path, "").await { + Err(FetchError::Unverifiable(_)) => {} + other => panic!("expected Unverifiable for empty hash, got {other:?}"), + } + } + + #[cfg(feature = "cargo")] + #[tokio::test] + async fn cargo_crate_fetch_verifies_sha256_and_extracts() { + // .crate = tar.gz with a {name}-{version}/ top dir. 
+ let crate_bytes = make_tgz(&[ + ("left-pad-1.3.0/Cargo.toml", b"[package]\nname = \"left-pad\"\n", false), + ("left-pad-1.3.0/src/lib.rs", b"pub fn pad() {}\n", false), + ]); + let sha = hex::encode(Sha256::digest(&crate_bytes)); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/left-pad-1.3.0.crate")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(crate_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "cargo", + name: "left-pad".into(), + version: "1.3.0".into(), + purl: "pkg:cargo/left-pad@1.3.0".into(), + resolved: Some(format!("{}/left-pad/left-pad-1.3.0.crate", mock.uri())), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("Cargo.toml").is_file()); + assert!(fetched.dir().join("src/lib.rs").is_file()); + + // Tampered checksum fails closed. + let entry = LockfileEntry { + integrity: LockIntegrity::Sha256Hex("0".repeat(64)), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + /// Build a go module zip in memory (files only, `module@version/` + /// prefix — the go zip layout). + #[cfg(feature = "golang")] + fn make_module_zip(prefix: &str, files: &[(&str, &[u8])]) -> Vec { + use std::io::Write as _; + let mut writer = zip::ZipWriter::new(std::io::Cursor::new(Vec::new())); + for (name, bytes) in files { + writer + .start_file( + format!("{prefix}{name}"), + zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated), + ) + .unwrap(); + writer.write_all(bytes).unwrap(); + } + writer.finish().unwrap().into_inner() + } + + /// Independent spec-mirror of dirhash Hash1/HashZip, structured + /// differently from the production fn to catch encoding slips. 
+ #[cfg(feature = "golang")] + fn spec_h1(files: &[(&str, &[u8])], prefix: &str) -> String { + // dirhash.Hash1 sorts the FILE NAMES, then emits one line per file. + let mut named: Vec<(String, &[u8])> = files + .iter() + .map(|(name, bytes)| (format!("{prefix}{name}"), *bytes)) + .collect(); + named.sort_by(|a, b| a.0.cmp(&b.0)); + let lines: Vec = named + .iter() + .map(|(name, bytes)| format!("{} {name}\n", hex::encode(Sha256::digest(bytes)))) + .collect(); + let digest = Sha256::digest(lines.concat().as_bytes()); + format!( + "h1:{}", + base64::engine::general_purpose::STANDARD.encode(digest) + ) + } + + #[cfg(feature = "golang")] + #[tokio::test] + async fn golang_module_fetch_verifies_h1_dirhash_and_extracts() { + // Out-of-order files prove the sort; nested module path proves the + // explicit-prefix strip (a first-component strip would be wrong). + let prefix = "github.com/x/y@v1.0.0/"; + let files: [(&str, &[u8]); 3] = [ + ("go.mod", b"module github.com/x/y\n"), + ("a/b.go", b"package a\n"), + ("README.md", b"# y\n"), + ]; + let zip_bytes = make_module_zip(prefix, &files); + let expected = spec_h1(&files, prefix); + assert_eq!( + go_h1_of_zip(&zip_bytes).unwrap(), + expected, + "production dirhash matches the spec mirror" + ); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/github.com/x/y/@v/v1.0.0.zip")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(zip_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "golang", + name: "github.com/x/y".into(), + version: "v1.0.0".into(), + purl: "pkg:golang/github.com/x/y@v1.0.0".into(), + resolved: Some(format!("{}/github.com/x/y/@v/v1.0.0.zip", mock.uri())), + integrity: LockIntegrity::GoH1(expected), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("go.mod").is_file()); + assert!(fetched.dir().join("a/b.go").is_file()); + + // Tampered h1 fails closed. 
+ let entry = LockfileEntry { + integrity: LockIntegrity::GoH1("h1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=".into()), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + #[cfg(feature = "golang")] + #[test] + fn go_escape_uppercase_and_zip_prefix_guards() { + assert_eq!(go_escape("github.com/Azure/azure-sdk"), "github.com/!azure/azure-sdk"); + assert_eq!(go_escape("v1.0.0-RC1"), "v1.0.0-!r!c1"); + + // An entry outside the module prefix fails the whole artifact. + let zip_bytes = make_module_zip("github.com/x/y@v1.0.0/", &[("go.mod", b"m\n")]); + let tmp = tempfile::tempdir().unwrap(); + let err = extract_zip_with_prefix(&zip_bytes, tmp.path(), "github.com/OTHER@v1/") + .unwrap_err(); + assert!(err.contains("outside"), "{err}"); + } + + /// Build a zip with the given `(path, bytes)` entries. + fn make_zip(files: &[(&str, &[u8])]) -> Vec { + use std::io::Write as _; + let mut writer = zip::ZipWriter::new(std::io::Cursor::new(Vec::new())); + for (name, bytes) in files { + writer + .start_file( + name.to_string(), + zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated), + ) + .unwrap(); + writer.write_all(bytes).unwrap(); + } + writer.finish().unwrap().into_inner() + } + + #[cfg(feature = "composer")] + #[tokio::test] + async fn composer_dist_fetch_verifies_sha1_and_strips_top_dir() { + // GitHub zipballs carry an `owner-repo-sha/` top dir. 
+ let zip_bytes = make_zip(&[ + ("Seldaek-monolog-abc123/composer.json", br#"{"name":"monolog/monolog"}"#), + ("Seldaek-monolog-abc123/src/Logger.php", b" assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + #[tokio::test] + async fn gem_fetch_verifies_sha256_and_extracts_data_tar() { + // .gem = plain tar holding data.tar.gz (content at the ROOT — no + // prefix dir) + metadata.gz. + let data_tgz = make_tgz(&[ + ("lib/rails.rb", b"module Rails; end\n", false), + ("README.md", b"# rails\n", false), + ]); + let mut outer = tar::Builder::new(Vec::new()); + for (name, bytes) in [("metadata.gz", b"meta".as_slice()), ("data.tar.gz", &data_tgz)] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + outer.append_data(&mut header, name, bytes).unwrap(); + } + let gem_bytes = outer.into_inner().unwrap(); + let sha = hex::encode(Sha256::digest(&gem_bytes)); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/downloads/rails-7.1.0.gem")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(gem_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "gem", + name: "rails".into(), + version: "7.1.0".into(), + purl: "pkg:gem/rails@7.1.0".into(), + resolved: Some(format!("{}/downloads/rails-7.1.0.gem", mock.uri())), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!( + fetched.dir().join("lib/rails.rb").is_file(), + "data.tar.gz content extracts at the root (no strip)" + ); + assert!(fetched.dir().join("README.md").is_file()); + } + + #[tokio::test] + async fn pypi_wheel_fetch_extracts_site_packages_layout() { + let wheel = make_zip(&[ + ("requests/__init__.py", b"__version__ = '2.28.0'\n"), + ( + "requests-2.28.0.dist-info/RECORD", + b"requests/__init__.py,sha256=abc,24\n", + ), + 
("requests-2.28.0.dist-info/WHEEL", b"Wheel-Version: 1.0\n"), + ]); + let sha = hex::encode(Sha256::digest(&wheel)); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/packages/requests-2.28.0-py3-none-any.whl")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(wheel)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "pypi", + name: "requests".into(), + version: "2.28.0".into(), + purl: "pkg:pypi/requests@2.28.0".into(), + resolved: Some(format!( + "{}/packages/requests-2.28.0-py3-none-any.whl", + mock.uri() + )), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + // Wheel content at the root: a site-packages-shaped dir with the + // dist-info RECORD the pypi vendor backend stages from. + assert!(fetched.dir().join("requests/__init__.py").is_file()); + assert!(fetched + .dir() + .join("requests-2.28.0.dist-info/RECORD") + .is_file()); + + // No recorded wheel URL (poetry/requirements) → Unverifiable. + let entry = LockfileEntry { + resolved: None, + integrity: LockIntegrity::Sha256Hex("0".repeat(64)), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => assert!(msg.contains("wheel"), "{msg}"), + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[test] + fn oversized_entry_header_fails_closed() { + // A header CLAIMING more than the per-entry cap fails before any + // attempt to read that much data. + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + header.set_path("package/huge.bin").unwrap(); + header.set_size(MAX_ENTRY_BYTES + 1); + header.set_mode(0o644); + header.set_cksum(); + // Intentionally append no data: the size check fires first. 
+ let inner = { + use std::io::Write as _; + builder.get_mut().write_all(&header.as_bytes()[..]).unwrap(); + builder.into_inner().unwrap().finish().unwrap() + }; + let tmp = tempfile::tempdir().unwrap(); + let err = extract_tgz(&inner, tmp.path()).unwrap_err(); + assert!( + err.contains("cap") || err.contains("unreadable"), + "oversize header fails closed: {err}" + ); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs index 613b503..2dd101b 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs @@ -83,7 +83,7 @@ pub async fn vendor_yarn_berry( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel.clone(); let base_purl = coords.base_purl.clone(); let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); @@ -195,6 +195,12 @@ pub async fn vendor_yarn_berry( format!("{PACKAGE_JSON} root is not an object"), ); }; + // A user-authored BARE-name pin to the exact version being vendored is + // TAKEN OVER (its value is rewritten to our spec — the pin already + // forced this exact version, so semantics are preserved — and recorded + // as the wiring `original` so revert restores it). Anything else + // same-name still refuses. + let mut takeover_original: Option = None; if let Some(res) = pkg_obj.get("resolutions") { let Some(res_obj) = res.as_object() else { return refused( @@ -210,19 +216,26 @@ pub async fn vendor_yarn_berry( continue; } // Our own (possibly stale-uuid) entry is fine to overwrite; a - // user-authored override is never clobbered. + // user-authored override is never clobbered silently. 
let ours = value .as_str() .is_some_and(|v| parse_vendor_path(v).is_some_and(|p| p.eco == "npm")); - if !ours { - return refused( - "vendor_override_conflict", - format!( - "{PACKAGE_JSON} already has a resolutions entry for `{selector}` \ - ({value}); vendor will not overwrite a user-authored override" - ), - ); + if ours { + continue; + } + if selector == name && value.as_str() == Some(version) { + takeover_original = Some(version.to_string()); + continue; } + return refused( + "vendor_override_conflict", + format!( + "{PACKAGE_JSON} already has a resolutions entry for `{selector}` \ + ({value}); vendor will not overwrite a user-authored override (an \ + exact-version pin `\"{name}\": \"{version}\"` is taken over \ + automatically)" + ), + ); } } @@ -254,6 +267,7 @@ pub async fn vendor_yarn_berry( sources, dry_run, force, + &mut warnings, ) .await { @@ -393,16 +407,17 @@ pub async fn vendor_yarn_berry( WiringRecord { file: PACKAGE_JSON.to_string(), kind: KIND_RESOLUTION.to_string(), - // Rewritten only when replacing our own stale entry — and then - // there is deliberately no `original` (never record our own edit - // as a pre-vendor fragment). + // Rewritten when replacing our own stale entry (no `original` — + // never record our own edit as a pre-vendor fragment) or a + // taken-over user pin (whose value IS the `original`, restored + // verbatim on revert). action: if existing_entry { WiringAction::Rewritten } else { WiringAction::Added }, key: Some(name.to_string()), - original: None, + original: takeover_original.map(Value::String), new: Some(Value::String(spec)), }, WiringRecord { @@ -688,6 +703,13 @@ fn revert_resolution_record( )); return; } + // A takeover recorded the user's pinned value: restore it in place + // (the key and table stay). Otherwise remove our entry as before. 
+ if let Some(orig) = rec.original.as_ref().and_then(Value::as_str) { + res_obj.insert(key.to_string(), Value::String(orig.to_string())); + *changed = true; + return; + } res_obj.shift_remove(key); if res_obj.is_empty() { obj.shift_remove("resolutions"); @@ -876,7 +898,7 @@ fn carried_sections(lines: &[String]) -> Vec { } /// Read a berry scalar field (`: `, value possibly quoted). -fn berry_field<'a>(lines: &'a [String], field: &str) -> Option<&'a str> { +pub(super) fn berry_field<'a>(lines: &'a [String], field: &str) -> Option<&'a str> { for line in lines.iter().skip(1) { let Some(rest) = body_field_line(line) else { continue; @@ -1356,6 +1378,62 @@ __metadata: assert_eq!(tokio::fs::read(fx.pkg_path()).await.unwrap(), fx.pkg_bytes); } + /// A user-authored BARE-name pin to the exact version being vendored is + /// taken over: the value moves to our spec, the wiring records the pin + /// as `original`, and revert restores it (table kept). Range-keyed + /// selectors keep refusing. + #[tokio::test] + async fn user_exact_pin_resolution_is_taken_over_and_revert_restores_it() { + let pkg_before = B3_BEFORE_PKG.replace( + " }\n}", + " },\n \"resolutions\": {\n \"left-pad\": \"1.3.0\"\n }\n}", + ); + let fx = fixture_with(&pkg_before, B3_BEFORE_LOCK).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + let pkg: Value = + serde_json::from_slice(&tokio::fs::read(fx.pkg_path()).await.unwrap()).unwrap(); + let val = pkg["resolutions"]["left-pad"].as_str().unwrap(); + assert!( + parse_vendor_path(val).is_some_and(|p| p.eco == "npm"), + "pin value rewritten to our spec: {val}" + ); + + let rec = entry + .wiring + .iter() + .find(|r| r.kind == KIND_RESOLUTION) + .unwrap(); + assert_eq!(rec.action, WiringAction::Rewritten); + assert_eq!( + rec.original, + Some(Value::String("1.3.0".to_string())), + "the user's pin is the original" + ); + + // Revert restores the pin in place 
(the resolutions table stays). + let outcome = revert_yarn_berry(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + let pkg: Value = + serde_json::from_slice(&tokio::fs::read(fx.pkg_path()).await.unwrap()).unwrap(); + assert_eq!( + pkg["resolutions"]["left-pad"], + Value::String("1.3.0".to_string()), + "pin restored" + ); + + // A range-keyed selector with the same value still refuses. + let pkg = B3_BEFORE_PKG.replace( + " }\n}", + " },\n \"resolutions\": {\n \"left-pad@npm:1.x\": \"1.3.0\"\n }\n}", + ); + let fx = fixture_with(&pkg, B3_BEFORE_LOCK).await; + expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + } + #[tokio::test] async fn missing_entry_and_other_version_guards() { // No left-pad entry at all. diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs index ac1ff84..6278bb8 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs @@ -68,7 +68,7 @@ pub async fn vendor_yarn_classic( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel; let base_purl = coords.base_purl; @@ -135,6 +135,7 @@ pub async fn vendor_yarn_classic( sources, dry_run, force, + &mut warnings, ) .await { @@ -554,7 +555,7 @@ fn rewrite_classic_block( /// Does this block's `resolved` already point into `.socket/vendor/npm/` /// (ours — current or stale uuid)? 
-fn block_points_into_vendor(lines: &[String]) -> bool { +pub(super) fn block_points_into_vendor(lines: &[String]) -> bool { classic_field(lines, "resolved") .and_then(parse_vendor_path) .is_some_and(|p| p.eco == "npm") diff --git a/crates/socket-patch-core/src/utils/purl.rs b/crates/socket-patch-core/src/utils/purl.rs index 6ea0e80..4fa80ec 100644 --- a/crates/socket-patch-core/src/utils/purl.rs +++ b/crates/socket-patch-core/src/utils/purl.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + /// Strip the trailing `?qualifiers` and `#subpath` components from a PURL, /// leaving the canonical `pkg:type/namespace/name@version` base. /// @@ -18,6 +20,94 @@ pub fn strip_purl_qualifiers(purl: &str) -> &str { } } +/// Strictly percent-decode ONE purl path component (a scope, namespace +/// segment, name, or version) AFTER it has been split out of the purl. +/// +/// The patches API serves purls in canonical percent-encoded form +/// (`pkg:npm/%40scope/name@1.0.0`), while crawlers build purls from the +/// literal on-disk names (`pkg:npm/@scope/name@1.0.0`). Parsers must +/// decode the API form to find installed packages. +/// +/// SECURITY: this must only ever be called on a component AFTER the purl +/// has been split on `/` and the version `@` — so an encoded separator +/// (`%2f`) cannot create new path segments at parse time; it surfaces as +/// a literal `/` *inside* one component — and BEFORE the path-safety +/// guards run, so `%2e%2e`, `%2f`, `%5c`, `%00` are rejected post-decode +/// by the same `is_safe_*` gates that reject their literal forms. +/// Guarding the encoded form instead would be a traversal bypass. +/// +/// Decoding is all-or-nothing: an invalid escape (`%G1`, trailing `%4`) +/// or a non-UTF8 decode returns the input unchanged (fail-safe — the +/// undecoded form contains no separators, and `%` is not a legal +/// character in any real package name). Zero-alloc when no `%`. 
+pub fn percent_decode_purl_component(component: &str) -> Cow<'_, str> { + if !component.contains('%') { + return Cow::Borrowed(component); + } + fn hex_val(b: u8) -> Option { + match b { + b'0'..=b'9' => Some(b - b'0'), + b'a'..=b'f' => Some(b - b'a' + 10), + b'A'..=b'F' => Some(b - b'A' + 10), + _ => None, + } + } + let bytes = component.as_bytes(); + let mut out: Vec = Vec::with_capacity(bytes.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'%' { + let (Some(hi), Some(lo)) = ( + bytes.get(i + 1).copied().and_then(hex_val), + bytes.get(i + 2).copied().and_then(hex_val), + ) else { + // Invalid escape: leave the whole component verbatim. + return Cow::Borrowed(component); + }; + out.push(hi * 16 + lo); + i += 3; + } else { + out.push(bytes[i]); + i += 1; + } + } + match String::from_utf8(out) { + Ok(s) => Cow::Owned(s), + // Decoded bytes are not UTF-8: leave the component verbatim. + Err(_) => Cow::Borrowed(component), + } +} + +/// Canonical string form for purl-to-purl comparison and display: +/// percent-decode each `/`-separated component of the +/// `pkg:type/...@version` base; qualifiers/subpath are appended verbatim. +/// +/// Used ONLY for string equality (`purl_eq`) and human output — never to +/// build filesystem paths (a `%2f` decoding into a name can at worst make +/// two distinct purls compare equal, not change a write location). +pub fn normalize_purl(purl: &str) -> Cow<'_, str> { + if !purl.contains('%') { + return Cow::Borrowed(purl); + } + let split = purl.find(['?', '#']).unwrap_or(purl.len()); + let (base, suffix) = purl.split_at(split); + let mut out = String::with_capacity(purl.len()); + for (i, seg) in base.split('/').enumerate() { + if i > 0 { + out.push('/'); + } + out.push_str(&percent_decode_purl_component(seg)); + } + out.push_str(suffix); + Cow::Owned(out) +} + +/// Purl equality up to percent-encoding of the base components +/// (`pkg:npm/%40scope/x@1` ≡ `pkg:npm/@scope/x@1`). 
+pub fn purl_eq(a: &str, b: &str) -> bool { + normalize_purl(a) == normalize_purl(b) +} + /// Parse a PyPI PURL to extract name and version. /// /// e.g., `"pkg:pypi/requests@2.28.0?artifact_id=abc"` -> `Some(("requests", "2.28.0"))` @@ -154,8 +244,12 @@ pub fn build_composer_purl(namespace: &str, name: &str, version: &str) -> String /// We follow the same shape as `parse_composer_purl` since both /// have a `/` namespace structure. The leading `@` on /// the scope is preserved (matching npm's `@scope/name` convention). +/// `((scope, name), version)` from a JSR purl, percent-decoded. +#[cfg(feature = "deno")] +pub type JsrPurlParts<'a> = ((Cow<'a, str>, Cow<'a, str>), Cow<'a, str>); + #[cfg(feature = "deno")] -pub fn parse_jsr_purl(purl: &str) -> Option<((&str, &str), &str)> { +pub fn parse_jsr_purl(purl: &str) -> Option> { let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:jsr/")?; let at_idx = rest.rfind('@')?; @@ -167,8 +261,12 @@ pub fn parse_jsr_purl(purl: &str) -> Option<((&str, &str), &str)> { } let slash_idx = name_part.find('/')?; - let scope = &name_part[..slash_idx]; - let name = &name_part[slash_idx + 1..]; + // Decode AFTER splitting on `/`/`@` and BEFORE the shape checks below + // (and the caller's `is_safe_jsr_component` gate) — see + // `percent_decode_purl_component`. The API serves `%40scope`. + let scope = percent_decode_purl_component(&name_part[..slash_idx]); + let name = percent_decode_purl_component(&name_part[slash_idx + 1..]); + let version = percent_decode_purl_component(version); // Scope must be `@`. The bare `@` (length 1) is // invalid — there's no actual scope after the marker. @@ -248,15 +346,22 @@ pub fn is_purl(s: &str) -> bool { /// /// Non-PyPI keys never carry a `?`, so for them this reduces to plain /// equality. 
+/// +/// Comparison is encoding-tolerant (`purl_eq`): manifest keys come from +/// the API in percent-encoded form (`pkg:npm/%40scope/x@1`) while users +/// type the literal form — both spellings must match either way around. pub fn purl_matches_identifier(manifest_key: &str, identifier: &str) -> bool { if identifier.contains('?') { - manifest_key == identifier + purl_eq(manifest_key, identifier) } else { // Base identifier: compare bases. Strip both sides so a subpath // (`#...`) carried by either the key or the identifier doesn't // defeat the match — `strip_purl_qualifiers(identifier)` is a no-op // for a plain base PURL, so existing behaviour is unchanged. - strip_purl_qualifiers(manifest_key) == strip_purl_qualifiers(identifier) + purl_eq( + strip_purl_qualifiers(manifest_key), + strip_purl_qualifiers(identifier), + ) } } @@ -504,25 +609,31 @@ mod tests { ); } + #[cfg(feature = "deno")] + fn jsr_parts(purl: &str) -> Option<(String, String, String)> { + parse_jsr_purl(purl) + .map(|((s, n), v)| (s.into_owned(), n.into_owned(), v.into_owned())) + } + #[cfg(feature = "deno")] #[test] fn test_parse_jsr_purl() { assert_eq!( - parse_jsr_purl("pkg:jsr/@std/path@0.220.0"), - Some((("@std", "path"), "0.220.0")) + jsr_parts("pkg:jsr/@std/path@0.220.0"), + Some(("@std".into(), "path".into(), "0.220.0".into())) ); assert_eq!( - parse_jsr_purl("pkg:jsr/@luca/flag@1.0.0"), - Some((("@luca", "flag"), "1.0.0")) + jsr_parts("pkg:jsr/@luca/flag@1.0.0"), + Some(("@luca".into(), "flag".into(), "1.0.0".into())) ); // Scope must start with `@`. - assert_eq!(parse_jsr_purl("pkg:jsr/std/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/std/path@0.220.0"), None); // Empty pieces. 
- assert_eq!(parse_jsr_purl("pkg:jsr/@/path@0.220.0"), None); - assert_eq!(parse_jsr_purl("pkg:jsr/@std/@0.220.0"), None); - assert_eq!(parse_jsr_purl("pkg:jsr/@std/path@"), None); + assert_eq!(jsr_parts("pkg:jsr/@/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/@std/@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/@std/path@"), None); // Wrong scheme. - assert_eq!(parse_jsr_purl("pkg:npm/@std/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:npm/@std/path@0.220.0"), None); } #[cfg(feature = "deno")] @@ -661,8 +772,8 @@ mod tests { // Scope `@` + version `@` + qualifier `@` all coexist; only the // version `@` should be honored. assert_eq!( - parse_jsr_purl("pkg:jsr/@std/path@0.220.0?download_url=x@y"), - Some((("@std", "path"), "0.220.0")) + jsr_parts("pkg:jsr/@std/path@0.220.0?download_url=x@y"), + Some(("@std".into(), "path".into(), "0.220.0".into())) ); } @@ -748,6 +859,88 @@ mod tests { )); } + // --- Percent-decoding: API purls carry %-encoded components -------------- + + #[test] + fn test_percent_decode_purl_component() { + // The canonical case: an encoded npm scope marker. + assert_eq!( + percent_decode_purl_component("%40modelcontextprotocol"), + "@modelcontextprotocol" + ); + // Traversal sequences decode — the post-decode safety guards are + // what reject them, not this helper. + assert_eq!(percent_decode_purl_component("%2e%2e"), ".."); + assert_eq!(percent_decode_purl_component("a%2fb"), "a/b"); + assert_eq!(percent_decode_purl_component("%00"), "\0"); + // Invalid escapes leave the WHOLE component verbatim (all-or-nothing). + assert_eq!(percent_decode_purl_component("%G1abc"), "%G1abc"); + assert_eq!(percent_decode_purl_component("abc%4"), "abc%4"); + assert_eq!(percent_decode_purl_component("abc%"), "abc%"); + // Non-UTF8 decode (0xFF is never a valid UTF-8 byte) leaves it verbatim. + assert_eq!(percent_decode_purl_component("%FF"), "%FF"); + // No '%' is zero-alloc (borrowed). 
+ assert!(matches!( + percent_decode_purl_component("plain-name"), + Cow::Borrowed(_) + )); + } + + #[test] + fn test_normalize_purl_and_purl_eq() { + assert_eq!( + normalize_purl("pkg:npm/%40modelcontextprotocol/sdk@1.12.0"), + "pkg:npm/@modelcontextprotocol/sdk@1.12.0" + ); + assert!(purl_eq( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/x@1.0.0" + )); + assert!(purl_eq( + "pkg:npm/@scope/x@1.0.0", + "pkg:npm/%40scope/x@1.0.0" + )); + assert!(!purl_eq("pkg:npm/%40scope/x@1.0.0", "pkg:npm/@scope/x@2.0.0")); + // Qualifiers/subpath are preserved verbatim (not decoded). + assert_eq!( + normalize_purl("pkg:npm/%40s/x@1?artifact_id=a%2Fb"), + "pkg:npm/@s/x@1?artifact_id=a%2Fb" + ); + // Unencoded input is unchanged (and borrowed). + assert!(matches!( + normalize_purl("pkg:npm/lodash@4.17.21"), + Cow::Borrowed(_) + )); + } + + #[test] + fn test_purl_matches_identifier_decodes_encoded_key() { + // Encoded manifest key vs literal identifier — and vice versa. + assert!(purl_matches_identifier( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/x@1.0.0" + )); + assert!(purl_matches_identifier( + "pkg:npm/@scope/x@1.0.0", + "pkg:npm/%40scope/x@1.0.0" + )); + assert!(!purl_matches_identifier( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/y@1.0.0" + )); + } + + #[cfg(feature = "deno")] + #[test] + fn test_parse_jsr_purl_percent_encoded_scope() { + let ((scope, name), version) = parse_jsr_purl("pkg:jsr/%40std/path@0.220.0").unwrap(); + assert_eq!(scope, "@std"); + assert_eq!(name, "path"); + assert_eq!(version, "0.220.0"); + // The encoded bare `@` is still rejected post-decode. + assert_eq!(jsr_parts("pkg:jsr/%40/path@0.220.0"), None); + } + // --- Regression: name must not absorb the version separator ------------- #[test]