From 42e95bee0c4df13b172d49e8c996aef734635360 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 15:05:04 -0400 Subject: [PATCH 01/19] fix(purl): percent-decode purl components from the API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patches API serves scoped purls percent-encoded (pkg:npm/%40scope/name@1.0.0) and scan stores them verbatim as manifest keys, but neither the npm crawler nor the vendor coordinate parser decoded them — so apply/vendor reported scoped packages as 'package not installed', and detect_prunable saw every encoded entry as prunable. - utils/purl.rs: percent_decode_purl_component (strict, all-or-nothing, fail-safe passthrough), normalize_purl + purl_eq (compare/display only, never path construction) - npm_crawler parse_purl_components, vendor parse_npm_purl (NpmCoords now owns decoded name/version; base_purl stays verbatim for ledger/ manifest key parity), parse_jsr_purl: decode AFTER /-and-@ splits, BEFORE the is_safe_* guards — %2e%2e/%2f cannot smuggle traversal - detect_prunable + purl_matches_identifier compare normalized forms - human output shows the decoded purl; JSON keeps verbatim keys Co-Authored-By: Claude Fable 5 --- crates/socket-patch-cli/src/commands/get.rs | 4 +- crates/socket-patch-cli/src/commands/scan.rs | 42 +++- .../socket-patch-cli/src/commands/vendor.rs | 8 +- .../src/crawlers/deno_crawler.rs | 13 +- .../src/crawlers/npm_crawler.rs | 125 +++++++++- .../src/patch/vendor/bun_lock.rs | 2 +- .../src/patch/vendor/npm_common.rs | 91 ++++++-- .../src/patch/vendor/npm_lock.rs | 6 +- .../src/patch/vendor/pnpm_lock.rs | 2 +- .../src/patch/vendor/yarn_berry_lock.rs | 2 +- .../src/patch/vendor/yarn_classic_lock.rs | 2 +- crates/socket-patch-core/src/utils/purl.rs | 221 ++++++++++++++++-- 12 files changed, 447 insertions(+), 71 deletions(-) diff --git a/crates/socket-patch-cli/src/commands/get.rs b/crates/socket-patch-cli/src/commands/get.rs index e7359aa..0f520f6 
100644 --- a/crates/socket-patch-cli/src/commands/get.rs +++ b/crates/socket-patch-cli/src/commands/get.rs @@ -13,7 +13,7 @@ use socket_patch_core::manifest::schema::{ }; use socket_patch_core::patch::apply::select_installed_variants; use socket_patch_core::utils::fuzzy_match::fuzzy_match_packages; -use socket_patch_core::utils::purl::{is_purl, strip_purl_qualifiers}; +use socket_patch_core::utils::purl::{is_purl, normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{track_patch_fetch_failed, track_patch_fetched}; use std::collections::HashMap; use std::fmt; @@ -1030,7 +1030,7 @@ pub async fn download_and_apply_patches( let action = decide_patch_action(&manifest, &patch.purl, &patch.uuid); if let PatchAction::Skipped = action { if !params.json && !params.silent { - eprintln!(" [skip] {} (already in manifest)", patch.purl); + eprintln!(" [skip] {} (already in manifest)", normalize_purl(&patch.purl)); } downloaded_patches.push(serde_json::json!({ "purl": patch.purl, diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index f395763..00b2fe9 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -10,7 +10,7 @@ use socket_patch_core::patch::apply_lock; use socket_patch_core::utils::cleanup_blobs::{ cleanup_unused_archives, cleanup_unused_blobs, CleanupResult, }; -use socket_patch_core::utils::purl::strip_purl_qualifiers; +use socket_patch_core::utils::purl::{normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{ track_patch_scan_failed, track_patch_scanned, track_patch_vendor_failed, track_patch_vendored, }; @@ -197,23 +197,29 @@ async fn preview_apply_gc( /// copy is its NORMAL state, not "no longer installed". Without this, a /// wiped node_modules would prune the manifest entry — and the next /// `vendor` run would then reconcile-revert the vendoring itself. 
+/// +/// Both sides are compared in percent-DECODED form (`normalize_purl`): +/// manifest keys come from the API encoded (`pkg:npm/%40scope/x@1`) while +/// crawler purls carry the literal `@scope` — comparing the raw strings +/// would make every encoded scoped entry look prunable and `--prune`/ +/// `--sync` would GC the very patch it just downloaded. pub(crate) fn detect_prunable( manifest: &PatchManifest, scanned_purls: &HashSet, vendored: &HashSet, ) -> Vec { - let scanned_bases: HashSet<&str> = scanned_purls + let scanned_bases: HashSet = scanned_purls .iter() - .map(|p| strip_purl_qualifiers(p)) + .map(|p| normalize_purl(strip_purl_qualifiers(p)).into_owned()) .collect(); manifest .patches .keys() .filter(|p| { - let base = strip_purl_qualifiers(p); - !scanned_bases.contains(base) + let base = normalize_purl(strip_purl_qualifiers(p)); + !scanned_bases.contains(base.as_ref()) && !vendored.contains(p.as_str()) - && !vendored.contains(base) + && !vendored.contains(strip_purl_qualifiers(p)) }) .cloned() .collect() @@ -1804,7 +1810,7 @@ pub async fn run(args: ScanArgs) -> i32 { for p in &vendored_selected { println!( " [skip] {} (vendored — run scan --vendor to update)", - p.purl + normalize_purl(&p.purl) ); } } @@ -1851,7 +1857,10 @@ pub async fn run(args: ScanArgs) -> i32 { println!( " {} [{}] {}", - patch.purl, + // Human display only: show the decoded form of an + // API-encoded purl (`%40scope` → `@scope`). JSON output + // keeps the verbatim key. + normalize_purl(&patch.purl), patch.tier.to_uppercase(), sev_colored, ); @@ -2244,6 +2253,23 @@ mod tests { ); } + #[test] + fn detect_prunable_encoded_manifest_key_not_pruned() { + // The API serves scoped purls percent-encoded and they land in the + // manifest verbatim; the crawler reports the literal `@scope` form. + // Comparing raw strings would make every encoded scoped entry look + // prunable — `scan --prune` would GC the patch it just downloaded. 
+ let m = manifest_with(&[("pkg:npm/%40scope/x@1.0.0", "uuid-a")]); + let s = scanned(&["pkg:npm/@scope/x@1.0.0"]); + assert!( + detect_prunable(&m, &s, &no_vendored()).is_empty(), + "encoded manifest key must match the decoded scanned purl" + ); + // A genuinely-gone encoded entry still prunes. + let out = detect_prunable(&m, &scanned(&[]), &no_vendored()); + assert_eq!(out, vec!["pkg:npm/%40scope/x@1.0.0".to_string()]); + } + #[test] fn detect_prunable_exempts_qualified_variant_of_vendored_base() { // The ledger key set carries qualifier-stripped bases (see diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index ac7f9c6..f306f1d 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -27,7 +27,7 @@ use socket_patch_core::patch::vendor::{ self, ecosystem_dir_for_purl, load_state, save_state, RevertOutcome, VendorEntry, VendorOutcome, VendorWarning, }; -use socket_patch_core::utils::purl::strip_purl_qualifiers; +use socket_patch_core::utils::purl::{normalize_purl, strip_purl_qualifiers}; use socket_patch_core::utils::telemetry::{track_patch_vendor_failed, track_patch_vendored}; use socket_patch_core::vex::time::now_rfc3339; use std::collections::{HashMap, HashSet}; @@ -566,7 +566,7 @@ pub(crate) async fn vendor_records( ); } if !common.silent && !common.json { - eprintln!("Cannot vendor {candidate}: {detail}"); + eprintln!("Cannot vendor {}: {detail}", normalize_purl(candidate)); } } Some(VendorOutcome::Done { @@ -579,7 +579,7 @@ pub(crate) async fn vendor_records( if !common.silent && !common.json { eprintln!( "Failed to vendor {}: {}", - candidate, + normalize_purl(candidate), result.error.as_deref().unwrap_or("unknown error") ); } @@ -702,7 +702,7 @@ pub(crate) async fn vendor_records( .with_reason("package_not_installed", "no installed package found"), ); if !common.silent && !common.json { - eprintln!("Cannot vendor {purl}: package not 
installed"); + eprintln!("Cannot vendor {}: package not installed", normalize_purl(purl)); } } } diff --git a/crates/socket-patch-core/src/crawlers/deno_crawler.rs b/crates/socket-patch-core/src/crawlers/deno_crawler.rs index 5a12c2d..150cc27 100644 --- a/crates/socket-patch-core/src/crawlers/deno_crawler.rs +++ b/crates/socket-patch-core/src/crawlers/deno_crawler.rs @@ -120,16 +120,21 @@ impl DenoCrawler { // manifest PURL and are joined onto the cache root below. A real // JSR coordinate is a single path segment, so reject any that // could traverse out of the cache (`..`/`.`, a separator, NUL). + // The parser percent-decodes components, so these guards see the + // decoded form — `%2e%2e` cannot smuggle a traversal past them. // Unlike the cargo/npm crawlers there is no content check to catch // a bogus path, and jsr patches in place — so fail closed here. - if !(is_safe_jsr_component(scope) - && is_safe_jsr_component(name) - && is_safe_jsr_component(version)) + if !(is_safe_jsr_component(&scope) + && is_safe_jsr_component(&name) + && is_safe_jsr_component(&version)) { continue; } // Cache layout: //// - let pkg_dir = jsr_cache_path.join(scope).join(name).join(version); + let pkg_dir = jsr_cache_path + .join(&*scope) + .join(&*name) + .join(&*version); if !is_dir(&pkg_dir).await { continue; } diff --git a/crates/socket-patch-core/src/crawlers/npm_crawler.rs b/crates/socket-patch-core/src/crawlers/npm_crawler.rs index 91cb624..2d9f8d3 100644 --- a/crates/socket-patch-core/src/crawlers/npm_crawler.rs +++ b/crates/socket-patch-core/src/crawlers/npm_crawler.rs @@ -4,6 +4,7 @@ use std::path::{Path, PathBuf}; use serde::Deserialize; use super::types::{CrawledPackage, CrawlerOptions}; +use crate::utils::purl::{percent_decode_purl_component, strip_purl_qualifiers}; /// Default batch size for crawling. #[cfg(test)] @@ -686,11 +687,7 @@ impl NpmCrawler { /// Parse a PURL string to extract namespace, name, and version. 
fn parse_purl_components(purl: &str) -> Option<(Option, String, String)> { - // Strip qualifiers - let base = match purl.find('?') { - Some(idx) => &purl[..idx], - None => purl, - }; + let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:npm/")?; let at_idx = rest.rfind('@')?; @@ -701,16 +698,33 @@ impl NpmCrawler { return None; } - if name_part.starts_with('@') { - let slash_idx = name_part.find('/')?; - let namespace = name_part[..slash_idx].to_string(); - let name = name_part[slash_idx + 1..].to_string(); - if name.is_empty() { + // SECURITY: components are percent-decoded AFTER the `/`/`@` splits + // above (so an encoded `%2f` cannot create a new path segment here) + // and BEFORE the `is_safe_npm_component` guards in `find_by_purls` + // (so `%2e%2e` cannot smuggle a traversal past them). The API serves + // scoped purls as `pkg:npm/%40scope/name@version`, which must match + // the literal `node_modules/@scope/name` install. + let version = percent_decode_purl_component(version); + + if let Some(slash_idx) = name_part.find('/') { + let namespace = percent_decode_purl_component(&name_part[..slash_idx]); + let name = percent_decode_purl_component(&name_part[slash_idx + 1..]); + // An npm namespace is always an `@scope` (checked post-decode). + if name.is_empty() || !namespace.starts_with('@') { return None; } - Some((Some(namespace), name, version.to_string())) + Some(( + Some(namespace.into_owned()), + name.into_owned(), + version.into_owned(), + )) } else { - Some((None, name_part.to_string(), version.to_string())) + let name = percent_decode_purl_component(name_part); + // A bare `@scope` with no `/name` is not a package name. 
+ if name.starts_with('@') { + return None; + } + Some((None, name.into_owned(), version.into_owned())) } } } @@ -1031,6 +1045,93 @@ mod tests { assert!(!result.contains_key("pkg:npm/not-installed@0.0.1")); } + /// Regression: the patches API serves scoped purls percent-encoded + /// (`pkg:npm/%40scope/name@version`) and `scan` stores them verbatim as + /// manifest keys. `find_by_purls` must decode the components to match + /// the literal `node_modules/@scope/name` install — while keeping the + /// result keyed by the *verbatim* encoded input (downstream contract). + #[test] + fn test_parse_purl_components_percent_encoded_scope() { + let (ns, name, ver) = + NpmCrawler::parse_purl_components("pkg:npm/%40modelcontextprotocol/sdk@1.12.0") + .unwrap(); + assert_eq!(ns.as_deref(), Some("@modelcontextprotocol")); + assert_eq!(name, "sdk"); + assert_eq!(ver, "1.12.0"); + // An encoded bare scope with no `/name` is still not a package. + assert!(NpmCrawler::parse_purl_components("pkg:npm/%40scope@1.0.0").is_none()); + // A `#subpath` without a qualifier must not bleed into the version. 
+ let (_, name, ver) = + NpmCrawler::parse_purl_components("pkg:npm/foo@1.0.0#lib/util").unwrap(); + assert_eq!(name, "foo"); + assert_eq!(ver, "1.0.0"); + } + + #[tokio::test] + async fn test_find_by_purls_percent_encoded_scope_resolves() { + let dir = tempfile::tempdir().unwrap(); + let nm = dir.path().join("node_modules"); + + let sdk_dir = nm.join("@modelcontextprotocol").join("sdk"); + tokio::fs::create_dir_all(&sdk_dir).await.unwrap(); + tokio::fs::write( + sdk_dir.join("package.json"), + r#"{"name": "@modelcontextprotocol/sdk", "version": "1.12.0"}"#, + ) + .await + .unwrap(); + + let crawler = NpmCrawler::new(); + let encoded = "pkg:npm/%40modelcontextprotocol/sdk@1.12.0".to_string(); + let result = crawler + .find_by_purls(&nm, std::slice::from_ref(&encoded)) + .await + .unwrap(); + + assert_eq!(result.len(), 1, "encoded scope must resolve: {result:?}"); + let pkg = result + .get(&encoded) + .expect("result keyed by the verbatim encoded input purl"); + assert_eq!(pkg.path, sdk_dir); + assert_eq!(pkg.name, "sdk"); + assert_eq!(pkg.namespace.as_deref(), Some("@modelcontextprotocol")); + } + + /// SECURITY regression: percent-encoded traversal sequences must be + /// rejected by the post-decode guards — `%2e%2e` decodes to `..` and + /// `%2f` to `/`, so guarding the *encoded* form would be a bypass. + #[tokio::test] + async fn test_find_by_purls_rejects_encoded_traversal() { + let root = tempfile::tempdir().unwrap(); + let nm = root.path().join("node_modules"); + // A real scope dir so a scoped traversal's kernel walk could resolve. + tokio::fs::create_dir_all(nm.join("@x")).await.unwrap(); + + // A victim package OUTSIDE node_modules, reachable only via `..`. 
+ let evil_dir = root.path().join("evil"); + tokio::fs::create_dir_all(&evil_dir).await.unwrap(); + tokio::fs::write( + evil_dir.join("package.json"), + r#"{"name": "evil", "version": "1.0.0"}"#, + ) + .await + .unwrap(); + + let crawler = NpmCrawler::new(); + let purls = vec![ + "pkg:npm/%2e%2e/evil@1.0.0".to_string(), + "pkg:npm/@x/%2e%2e@1.0.0".to_string(), + "pkg:npm/@x/%2e%2e%2f%2e%2e%2fevil@1.0.0".to_string(), + "pkg:npm/..%2fevil@1.0.0".to_string(), + ]; + let result = crawler.find_by_purls(&nm, &purls).await.unwrap(); + + assert!( + result.is_empty(), + "encoded traversal must not escape node_modules; got {result:?}" + ); + } + /// Regression: a qualified PURL (carrying `?qualifiers`) must resolve and /// be keyed by the *verbatim* input PURL — not a reconstructed, stripped /// form. The dispatcher drives npm with `passthrough_purls` + diff --git a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs index 8199bc6..1565d01 100644 --- a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs @@ -82,7 +82,7 @@ pub async fn vendor_bun( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); // BN3 spelling: BARE project-relative path, no `file:`/`./` prefix. 
let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); diff --git a/crates/socket-patch-core/src/patch/vendor/npm_common.rs b/crates/socket-patch-core/src/patch/vendor/npm_common.rs index 1b26cc4..1cfc62d 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_common.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_common.rs @@ -21,21 +21,25 @@ use crate::manifest::schema::PatchRecord; use crate::patch::apply::{apply_package_patch, normalize_file_path, ApplyResult, PatchSources}; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::path_safety; -use crate::utils::purl::strip_purl_qualifiers; +use crate::utils::purl::{percent_decode_purl_component, strip_purl_qualifiers}; use super::npm_pack::{pack_deterministic, PackedTarball}; use super::path::vendor_uuid_dir_rel; use super::VendorOutcome; /// Validated npm vendoring coordinates (the output of -/// [`guard_coordinates`]). `name`/`version` borrow from the purl. +/// [`guard_coordinates`]). `name`/`version` are the percent-DECODED purl +/// components (the API serves scoped purls as `%40scope/name`; the +/// lockfile and node_modules carry the literal `@scope/name`). #[derive(Debug)] -pub(super) struct NpmCoords<'a> { - pub name: &'a str, - pub version: &'a str, +pub(super) struct NpmCoords { + pub name: String, + pub version: String, /// `.socket/vendor/npm/` (validated, forward slashes). pub uuid_dir_rel: String, - /// Qualifier-free base PURL. + /// Qualifier-free base PURL — VERBATIM (still encoded when the API + /// encoded it): the ledger's `base_purl`/entry keys must keep + /// matching the manifest keys, which store the purl as-served. pub base_purl: String, } @@ -49,17 +53,17 @@ pub(super) struct NpmCoords<'a> { /// vendor, arbitrary delete on revert) — reject fail-closed before any disk /// access. `Err` carries a ready [`VendorOutcome::Refused`] to bubble /// verbatim. 
-pub(super) fn guard_coordinates<'a>( - purl: &'a str, +pub(super) fn guard_coordinates( + purl: &str, record: &PatchRecord, -) -> Result, Box> { +) -> Result> { let Some((name, version)) = parse_npm_purl(purl) else { return Err(Box::new(refused( "unsafe_coordinates", format!("cannot parse an npm name@version out of `{purl}`"), ))); }; - if !is_safe_npm_name(name) || !path_safety::is_safe_single_segment(version) { + if !is_safe_npm_name(&name) || !path_safety::is_safe_single_segment(&version) { return Err(Box::new(refused( "unsafe_coordinates", format!( @@ -199,7 +203,7 @@ pub(super) async fn stage_patch_pack( let rel_tgz = format!( "{}/{}", coords.uuid_dir_rel, - tgz_rel_leaf(coords.name, coords.version) + tgz_rel_leaf(&coords.name, &coords.version) ); let dest = project_root.join(&rel_tgz); if let Some(parent) = dest.parent() { @@ -236,8 +240,8 @@ pub(super) async fn stage_patch_pack( Ok(( Some(NpmStagedPack { - name: coords.name.to_string(), - version: coords.version.to_string(), + name: coords.name, + version: coords.version, rel_tgz, packed, staged_pkg_json, @@ -251,14 +255,27 @@ pub(super) async fn stage_patch_pack( /// `pkg:npm/[@scope/]name@version` → `(name, version)`; scoped names keep /// the `@scope/` prefix. The LAST `@` separates the version (a leading /// scope-`@` is at index 0 and never the last `@` of a versioned purl). -pub(super) fn parse_npm_purl(purl: &str) -> Option<(&str, &str)> { +/// +/// Components are percent-DECODED (the API serves `pkg:npm/%40scope/...`). +/// SECURITY: each segment decodes independently AFTER the `/`/`@` splits, +/// and the post-decode `is_safe_npm_name`/`is_safe_single_segment` gates in +/// [`guard_coordinates`] reject any separator or traversal sequence a +/// decode may have surfaced (`%2e%2e`, `%2f`, ...) — decoding never runs +/// after the guards. 
+pub(super) fn parse_npm_purl(purl: &str) -> Option<(String, String)> { let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:npm/")?; let at = rest.rfind('@').filter(|&i| i > 0)?; - let (name, version) = (&rest[..at], &rest[at + 1..]); - if name.is_empty() || version.is_empty() { + let (name_raw, version_raw) = (&rest[..at], &rest[at + 1..]); + if name_raw.is_empty() || version_raw.is_empty() { return None; } + let name = name_raw + .split('/') + .map(percent_decode_purl_component) + .collect::>() + .join("/"); + let version = percent_decode_purl_component(version_raw).into_owned(); Some((name, version)) } @@ -369,18 +386,42 @@ mod tests { fn guard_coordinates_accepts_plain_and_scoped_names() { let record = record_with_uuid(UUID); let coords = guard_coordinates("pkg:npm/left-pad@1.3.0", &record).unwrap(); - assert_eq!((coords.name, coords.version), ("left-pad", "1.3.0")); + assert_eq!((coords.name.as_str(), coords.version.as_str()), ("left-pad", "1.3.0")); assert_eq!(coords.uuid_dir_rel, format!(".socket/vendor/npm/{UUID}")); assert_eq!(coords.base_purl, "pkg:npm/left-pad@1.3.0"); let coords = guard_coordinates("pkg:npm/@scope/pkg@1.0.0?artifact_id=x", &record).unwrap(); - assert_eq!((coords.name, coords.version), ("@scope/pkg", "1.0.0")); + assert_eq!((coords.name.as_str(), coords.version.as_str()), ("@scope/pkg", "1.0.0")); assert_eq!( coords.base_purl, "pkg:npm/@scope/pkg@1.0.0", "qualifiers stripped" ); } + /// The API serves scoped purls percent-encoded; the coordinates must + /// decode to the literal `@scope/name` (which keys the lockfile and + /// the artifact path), while `base_purl` stays verbatim — the ledger + /// must keep matching the manifest key as-served. 
+ #[test] + fn guard_coordinates_decodes_percent_encoded_scope() { + let record = record_with_uuid(UUID); + let coords = + guard_coordinates("pkg:npm/%40modelcontextprotocol/sdk@1.12.0", &record).unwrap(); + assert_eq!( + (coords.name.as_str(), coords.version.as_str()), + ("@modelcontextprotocol/sdk", "1.12.0") + ); + assert_eq!( + coords.base_purl, "pkg:npm/%40modelcontextprotocol/sdk@1.12.0", + "base_purl stays verbatim-encoded (manifest/ledger key parity)" + ); + assert_eq!( + tgz_rel_leaf(&coords.name, &coords.version), + "@modelcontextprotocol/sdk-1.12.0.tgz", + "artifact leaf is built from the decoded name" + ); + } + #[test] fn guard_coordinates_refuses_fail_closed() { let record = record_with_uuid(UUID); @@ -399,6 +440,20 @@ mod tests { guard_coordinates("pkg:npm/x@../1.0.0", &record).unwrap_err(), "unsafe_coordinates", ); + // SECURITY: percent-encoded traversal must be rejected POST-decode — + // guarding the encoded form would be a bypass (`%2e%2e` → `..`). + expect_refusal( + guard_coordinates("pkg:npm/%2e%2e/escape@1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); + expect_refusal( + guard_coordinates("pkg:npm/@scope/%2e%2e%2f%2e%2e@1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); + expect_refusal( + guard_coordinates("pkg:npm/x@%2e%2e%2f1.0.0", &record).unwrap_err(), + "unsafe_coordinates", + ); // Tampered uuid. 
let record = record_with_uuid("../../x"); expect_refusal( diff --git a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs index 2be4c7a..bed244b 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs @@ -91,7 +91,7 @@ pub async fn vendor_npm( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel; let base_purl = coords.base_purl; @@ -1633,11 +1633,11 @@ mod tests { fn purl_and_name_helpers() { assert_eq!( parse_npm_purl("pkg:npm/left-pad@1.3.0"), - Some(("left-pad", "1.3.0")) + Some(("left-pad".into(), "1.3.0".into())) ); assert_eq!( parse_npm_purl("pkg:npm/@scope/pkg@1.0.0?foo=bar"), - Some(("@scope/pkg", "1.0.0")) + Some(("@scope/pkg".into(), "1.0.0".into())) ); assert_eq!(parse_npm_purl("pkg:npm/@scope/pkg"), None, "no version"); assert_eq!( diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index cac16e9..48ec98c 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -93,7 +93,7 @@ pub async fn vendor_pnpm( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); // pnpm spells the override target `file:` with NO // `./` (spike P1 fixtures, verbatim). 
diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs index 613b503..d24c7c8 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs @@ -83,7 +83,7 @@ pub async fn vendor_yarn_berry( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel.clone(); let base_purl = coords.base_purl.clone(); let rel_tgz = format!("{}/{}", coords.uuid_dir_rel, tgz_rel_leaf(name, version)); diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs index ac1ff84..8d82a00 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs @@ -68,7 +68,7 @@ pub async fn vendor_yarn_classic( Ok(coords) => coords, Err(outcome) => return *outcome, }; - let (name, version) = (coords.name, coords.version); + let (name, version) = (coords.name.as_str(), coords.version.as_str()); let uuid_dir_rel = coords.uuid_dir_rel; let base_purl = coords.base_purl; diff --git a/crates/socket-patch-core/src/utils/purl.rs b/crates/socket-patch-core/src/utils/purl.rs index 6ea0e80..393873d 100644 --- a/crates/socket-patch-core/src/utils/purl.rs +++ b/crates/socket-patch-core/src/utils/purl.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + /// Strip the trailing `?qualifiers` and `#subpath` components from a PURL, /// leaving the canonical `pkg:type/namespace/name@version` base. /// @@ -18,6 +20,94 @@ pub fn strip_purl_qualifiers(purl: &str) -> &str { } } +/// Strictly percent-decode ONE purl path component (a scope, namespace +/// segment, name, or version) AFTER it has been split out of the purl. 
+/// +/// The patches API serves purls in canonical percent-encoded form +/// (`pkg:npm/%40scope/name@1.0.0`), while crawlers build purls from the +/// literal on-disk names (`pkg:npm/@scope/name@1.0.0`). Parsers must +/// decode the API form to find installed packages. +/// +/// SECURITY: this must only ever be called on a component AFTER the purl +/// has been split on `/` and the version `@` — so an encoded separator +/// (`%2f`) cannot create new path segments at parse time; it surfaces as +/// a literal `/` *inside* one component — and BEFORE the path-safety +/// guards run, so `%2e%2e`, `%2f`, `%5c`, `%00` are rejected post-decode +/// by the same `is_safe_*` gates that reject their literal forms. +/// Guarding the encoded form instead would be a traversal bypass. +/// +/// Decoding is all-or-nothing: an invalid escape (`%G1`, trailing `%4`) +/// or a non-UTF8 decode returns the input unchanged (fail-safe — the +/// undecoded form contains no separators, and `%` is not a legal +/// character in any real package name). Zero-alloc when no `%`. +pub fn percent_decode_purl_component(component: &str) -> Cow<'_, str> { + if !component.contains('%') { + return Cow::Borrowed(component); + } + fn hex_val(b: u8) -> Option { + match b { + b'0'..=b'9' => Some(b - b'0'), + b'a'..=b'f' => Some(b - b'a' + 10), + b'A'..=b'F' => Some(b - b'A' + 10), + _ => None, + } + } + let bytes = component.as_bytes(); + let mut out: Vec = Vec::with_capacity(bytes.len()); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'%' { + let (Some(hi), Some(lo)) = ( + bytes.get(i + 1).copied().and_then(hex_val), + bytes.get(i + 2).copied().and_then(hex_val), + ) else { + // Invalid escape: leave the whole component verbatim. + return Cow::Borrowed(component); + }; + out.push(hi * 16 + lo); + i += 3; + } else { + out.push(bytes[i]); + i += 1; + } + } + match String::from_utf8(out) { + Ok(s) => Cow::Owned(s), + // Decoded bytes are not UTF-8: leave the component verbatim. 
+ Err(_) => Cow::Borrowed(component), + } +} + +/// Canonical string form for purl-to-purl comparison and display: +/// percent-decode each `/`-separated component of the +/// `pkg:type/...@version` base; qualifiers/subpath are appended verbatim. +/// +/// Used ONLY for string equality (`purl_eq`) and human output — never to +/// build filesystem paths (a `%2f` decoding into a name can at worst make +/// two distinct purls compare equal, not change a write location). +pub fn normalize_purl(purl: &str) -> Cow<'_, str> { + if !purl.contains('%') { + return Cow::Borrowed(purl); + } + let split = purl.find(['?', '#']).unwrap_or(purl.len()); + let (base, suffix) = purl.split_at(split); + let mut out = String::with_capacity(purl.len()); + for (i, seg) in base.split('/').enumerate() { + if i > 0 { + out.push('/'); + } + out.push_str(&percent_decode_purl_component(seg)); + } + out.push_str(suffix); + Cow::Owned(out) +} + +/// Purl equality up to percent-encoding of the base components +/// (`pkg:npm/%40scope/x@1` ≡ `pkg:npm/@scope/x@1`). +pub fn purl_eq(a: &str, b: &str) -> bool { + normalize_purl(a) == normalize_purl(b) +} + /// Parse a PyPI PURL to extract name and version. /// /// e.g., `"pkg:pypi/requests@2.28.0?artifact_id=abc"` -> `Some(("requests", "2.28.0"))` @@ -155,7 +245,7 @@ pub fn build_composer_purl(namespace: &str, name: &str, version: &str) -> String /// have a `/` namespace structure. The leading `@` on /// the scope is preserved (matching npm's `@scope/name` convention). 
#[cfg(feature = "deno")] -pub fn parse_jsr_purl(purl: &str) -> Option<((&str, &str), &str)> { +pub fn parse_jsr_purl(purl: &str) -> Option<((Cow<'_, str>, Cow<'_, str>), Cow<'_, str>)> { let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:jsr/")?; let at_idx = rest.rfind('@')?; @@ -167,8 +257,12 @@ pub fn parse_jsr_purl(purl: &str) -> Option<((&str, &str), &str)> { } let slash_idx = name_part.find('/')?; - let scope = &name_part[..slash_idx]; - let name = &name_part[slash_idx + 1..]; + // Decode AFTER splitting on `/`/`@` and BEFORE the shape checks below + // (and the caller's `is_safe_jsr_component` gate) — see + // `percent_decode_purl_component`. The API serves `%40scope`. + let scope = percent_decode_purl_component(&name_part[..slash_idx]); + let name = percent_decode_purl_component(&name_part[slash_idx + 1..]); + let version = percent_decode_purl_component(version); // Scope must be `@`. The bare `@` (length 1) is // invalid — there's no actual scope after the marker. @@ -248,15 +342,22 @@ pub fn is_purl(s: &str) -> bool { /// /// Non-PyPI keys never carry a `?`, so for them this reduces to plain /// equality. +/// +/// Comparison is encoding-tolerant (`purl_eq`): manifest keys come from +/// the API in percent-encoded form (`pkg:npm/%40scope/x@1`) while users +/// type the literal form — both spellings must match either way around. pub fn purl_matches_identifier(manifest_key: &str, identifier: &str) -> bool { if identifier.contains('?') { - manifest_key == identifier + purl_eq(manifest_key, identifier) } else { // Base identifier: compare bases. Strip both sides so a subpath // (`#...`) carried by either the key or the identifier doesn't // defeat the match — `strip_purl_qualifiers(identifier)` is a no-op // for a plain base PURL, so existing behaviour is unchanged. 
- strip_purl_qualifiers(manifest_key) == strip_purl_qualifiers(identifier) + purl_eq( + strip_purl_qualifiers(manifest_key), + strip_purl_qualifiers(identifier), + ) } } @@ -504,25 +605,31 @@ mod tests { ); } + #[cfg(feature = "deno")] + fn jsr_parts(purl: &str) -> Option<(String, String, String)> { + parse_jsr_purl(purl) + .map(|((s, n), v)| (s.into_owned(), n.into_owned(), v.into_owned())) + } + #[cfg(feature = "deno")] #[test] fn test_parse_jsr_purl() { assert_eq!( - parse_jsr_purl("pkg:jsr/@std/path@0.220.0"), - Some((("@std", "path"), "0.220.0")) + jsr_parts("pkg:jsr/@std/path@0.220.0"), + Some(("@std".into(), "path".into(), "0.220.0".into())) ); assert_eq!( - parse_jsr_purl("pkg:jsr/@luca/flag@1.0.0"), - Some((("@luca", "flag"), "1.0.0")) + jsr_parts("pkg:jsr/@luca/flag@1.0.0"), + Some(("@luca".into(), "flag".into(), "1.0.0".into())) ); // Scope must start with `@`. - assert_eq!(parse_jsr_purl("pkg:jsr/std/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/std/path@0.220.0"), None); // Empty pieces. - assert_eq!(parse_jsr_purl("pkg:jsr/@/path@0.220.0"), None); - assert_eq!(parse_jsr_purl("pkg:jsr/@std/@0.220.0"), None); - assert_eq!(parse_jsr_purl("pkg:jsr/@std/path@"), None); + assert_eq!(jsr_parts("pkg:jsr/@/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/@std/@0.220.0"), None); + assert_eq!(jsr_parts("pkg:jsr/@std/path@"), None); // Wrong scheme. - assert_eq!(parse_jsr_purl("pkg:npm/@std/path@0.220.0"), None); + assert_eq!(jsr_parts("pkg:npm/@std/path@0.220.0"), None); } #[cfg(feature = "deno")] @@ -661,8 +768,8 @@ mod tests { // Scope `@` + version `@` + qualifier `@` all coexist; only the // version `@` should be honored. 
assert_eq!( - parse_jsr_purl("pkg:jsr/@std/path@0.220.0?download_url=x@y"), - Some((("@std", "path"), "0.220.0")) + jsr_parts("pkg:jsr/@std/path@0.220.0?download_url=x@y"), + Some(("@std".into(), "path".into(), "0.220.0".into())) ); } @@ -748,6 +855,88 @@ mod tests { )); } + // --- Percent-decoding: API purls carry %-encoded components -------------- + + #[test] + fn test_percent_decode_purl_component() { + // The canonical case: an encoded npm scope marker. + assert_eq!( + percent_decode_purl_component("%40modelcontextprotocol"), + "@modelcontextprotocol" + ); + // Traversal sequences decode — the post-decode safety guards are + // what reject them, not this helper. + assert_eq!(percent_decode_purl_component("%2e%2e"), ".."); + assert_eq!(percent_decode_purl_component("a%2fb"), "a/b"); + assert_eq!(percent_decode_purl_component("%00"), "\0"); + // Invalid escapes leave the WHOLE component verbatim (all-or-nothing). + assert_eq!(percent_decode_purl_component("%G1abc"), "%G1abc"); + assert_eq!(percent_decode_purl_component("abc%4"), "abc%4"); + assert_eq!(percent_decode_purl_component("abc%"), "abc%"); + // Non-UTF8 decode (lone continuation byte) leaves it verbatim. + assert_eq!(percent_decode_purl_component("%FF"), "%FF"); + // No '%' is zero-alloc (borrowed). + assert!(matches!( + percent_decode_purl_component("plain-name"), + Cow::Borrowed(_) + )); + } + + #[test] + fn test_normalize_purl_and_purl_eq() { + assert_eq!( + normalize_purl("pkg:npm/%40modelcontextprotocol/sdk@1.12.0"), + "pkg:npm/@modelcontextprotocol/sdk@1.12.0" + ); + assert!(purl_eq( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/x@1.0.0" + )); + assert!(purl_eq( + "pkg:npm/@scope/x@1.0.0", + "pkg:npm/%40scope/x@1.0.0" + )); + assert!(!purl_eq("pkg:npm/%40scope/x@1.0.0", "pkg:npm/@scope/x@2.0.0")); + // Qualifiers/subpath are preserved verbatim (not decoded). 
+ assert_eq!( + normalize_purl("pkg:npm/%40s/x@1?artifact_id=a%2Fb"), + "pkg:npm/@s/x@1?artifact_id=a%2Fb" + ); + // Unencoded input is unchanged (and borrowed). + assert!(matches!( + normalize_purl("pkg:npm/lodash@4.17.21"), + Cow::Borrowed(_) + )); + } + + #[test] + fn test_purl_matches_identifier_decodes_encoded_key() { + // Encoded manifest key vs literal identifier — and vice versa. + assert!(purl_matches_identifier( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/x@1.0.0" + )); + assert!(purl_matches_identifier( + "pkg:npm/@scope/x@1.0.0", + "pkg:npm/%40scope/x@1.0.0" + )); + assert!(!purl_matches_identifier( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/y@1.0.0" + )); + } + + #[cfg(feature = "deno")] + #[test] + fn test_parse_jsr_purl_percent_encoded_scope() { + let ((scope, name), version) = parse_jsr_purl("pkg:jsr/%40std/path@0.220.0").unwrap(); + assert_eq!(scope, "@std"); + assert_eq!(name, "path"); + assert_eq!(version, "0.220.0"); + // The encoded bare `@` is still rejected post-decode. + assert_eq!(jsr_parts("pkg:jsr/%40/path@0.220.0"), None); + } + // --- Regression: name must not absorb the version separator ------------- #[test] From c3c012f25a52a468b57d3597018c997bb967c962 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 15:32:14 -0400 Subject: [PATCH 02/19] feat(vendor): auto-force staging on content mismatch + correct already-applied events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vendor stage is a private copy and every apply write path is hash-gated to exactly afterHash, so a beforeHash mismatch (a patch built against different bytes than the installed artifact, or a package already patched in place by apply) no longer fails the vendor: the stage is overwritten with the verified patched content and the overwrite surfaces as a vendor_content_mismatch_overwritten warning event. 
Missing patch-target files still fail closed without --force (force's silent NotFound skip would pack an artifact without the fix). - shared force_apply_staged / missing_existing_patch_files / mismatch_overwrite_warnings policy helpers in vendor/mod.rs, used by all npm flavors (via stage_patch_pack) + cargo/composer/gem/pypi/ golang backends; dry runs predict the same outcome - vendor.rs: gate the already_vendored rewrite on entry.is_none() — the first vendor of an in-place-applied package now emits Applied (it packed + rewired this run) instead of a miscounted skip - scan --vendor: pre-prompt baseline check annotates mismatched packages before the confirm prompt (best-effort, read-only) - --force narrowed to missing-file tolerance + variant-probe bypass; CLI_CONTRACT.md documents the new warning code Co-Authored-By: Claude Fable 5 --- crates/socket-patch-cli/CLI_CONTRACT.md | 3 +- crates/socket-patch-cli/src/commands/scan.rs | 70 ++++++ .../socket-patch-cli/src/commands/vendor.rs | 44 +++- .../tests/in_process_vendor.rs | 178 +++++++++++++++ .../src/patch/vendor/bun_lock.rs | 1 + .../src/patch/vendor/cargo.rs | 38 ++-- .../src/patch/vendor/composer_lock.rs | 29 +-- .../socket-patch-core/src/patch/vendor/gem.rs | 29 +-- .../src/patch/vendor/golang.rs | 36 +++- .../socket-patch-core/src/patch/vendor/mod.rs | 204 +++++++++++++++++- .../src/patch/vendor/npm_common.rs | 29 ++- .../src/patch/vendor/npm_lock.rs | 171 +++++++++++++++ .../src/patch/vendor/pnpm_lock.rs | 1 + .../src/patch/vendor/pypi.rs | 1 + .../src/patch/vendor/pypi_wheel.rs | 83 ++++++- .../src/patch/vendor/yarn_berry_lock.rs | 1 + .../src/patch/vendor/yarn_classic_lock.rs | 1 + 17 files changed, 840 insertions(+), 79 deletions(-) diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md index 7fb1d59..464ef6a 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -55,7 +55,7 @@ Beyond the globals above, each subcommand 
defines a small set of local arguments | Subcommand | Local arg | Env var | Purpose | |---|---|---|---| | `apply` | `--force` / `-f` | `SOCKET_FORCE` | Bypass beforeHash check | -| `vendor` | `--force` / `-f` | `SOCKET_FORCE` | Bypass beforeHash check when staging the vendored copy | +| `vendor` | `--force` / `-f` | `SOCKET_FORCE` | Tolerate missing patch-target files in the stage + bypass the variant probe. A beforeHash mismatch no longer needs it: vendor staging auto-overwrites with the verified patched content (`vendor_content_mismatch_overwritten` warning) | | `vendor` | `--revert` | `SOCKET_VENDOR_REVERT` | Undo vendoring: restore recorded original lockfile fragments + remove `.socket/vendor/` artifacts. Works without a manifest | | `apply`, `scan`, `vendor` | `--vex` | `SOCKET_VEX` | Generate an OpenVEX 0.2.0 document at this path on a successful run; see "embedded VEX" below | | `apply`, `scan`, `vendor` | `--vex-product`, `--vex-no-verify`, `--vex-doc-id`, `--vex-compact` | `SOCKET_VEX_PRODUCT`, `SOCKET_VEX_NO_VERIFY`, `SOCKET_VEX_DOC_ID`, `SOCKET_VEX_COMPACT` | Passthrough to the embedded VEX builder; mirror the standalone `vex` knobs. Inert unless `--vex` is set | @@ -602,6 +602,7 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `vendor_yarn_berry_cache_unsupported` | `failed` | vendor (yarn berry): lock `cacheKey ≠ 10c0` or non-default `.yarnrc.yml` `compressionLevel` — the cache-zip checksum is not reproducible. | | `vendor_override_conflict` | `failed` | vendor (pnpm/yarn-berry): a user-authored override/resolution for the package already exists. | | `vendor_integrity_unverified` | `skipped` (warning) | vendor (pipenv): the lockfile format does not hash-check file entries; the committed wheel bytes are the protection. 
| +| `vendor_content_mismatch_overwritten` | `skipped` (warning) | vendor: a staged file matched NEITHER beforeHash nor afterHash (patch built against different bytes, or local edits); the stage was overwritten with the verified patched content and the vendor succeeded. | | `vendor_lock_checksums_unsupported` / `vendor_stale_lock_checksum` | `failed` | vendor (gem): an ambiguous/platform CHECKSUMS entry, or a v1-wired lock whose stale token blocks the hot path (run `vendor --revert` + re-vendor). | | `pypi_{poetry,pdm,pipenv}_no_lockfile` | `failed` | vendor (pypi): a lock-less tool marker with no `requirements.txt` fallback — run ` lock`. | | `vendor_*` / `pypi_*` / `gemfile_*` / `lock_*` / `locked_version_mismatch` / `user_authored_*` / `native_extensions_unsupported` / `platform_gem_unsupported` | `failed`/`skipped` | vendor: per-ecosystem refusal + drift vocabulary; see the Vendor command contract section. New tags are additive (MINOR). | diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index 00b2fe9..182e10d 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -225,6 +225,56 @@ pub(crate) fn detect_prunable( .collect() } +/// Vendor-mode pre-prompt check: uuids of selected patches whose installed +/// files match NEITHER beforeHash nor afterHash — the patch was built +/// against different bytes than the installed artifact. Vendoring still +/// succeeds for these (the vendor stage force-applies the verified patched +/// content; see `force_apply_staged`), but the user should learn it BEFORE +/// the confirm prompt, not from a post-hoc warning event. +/// +/// Best-effort and read-only: a detail-fetch failure or an unresolvable +/// installed path just skips the annotation — it never blocks the flow and +/// writes nothing (unlike `download_patch_records`, which stages blobs). 
+async fn preverify_vendor_baselines( + api_client: &socket_patch_core::api::client::ApiClient, + org_slug: Option<&str>, + selected: &[PatchSearchResult], + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> HashSet { + use socket_patch_core::manifest::schema::PatchFileInfo; + use socket_patch_core::patch::apply::{verify_file_patch, VerifyStatus}; + use socket_patch_core::utils::purl::purl_eq; + + let mut mismatched: HashSet = HashSet::new(); + for patch in selected { + // API purls come percent-encoded, crawler purls literal — purl_eq + // bridges the two spellings. + let base = strip_purl_qualifiers(&patch.purl); + let Some(pkg) = crawled.iter().find(|c| purl_eq(&c.purl, base)) else { + continue; + }; + let Ok(Some(detail)) = api_client.fetch_patch(org_slug, &patch.uuid).await else { + continue; + }; + for (file, info) in &detail.files { + let info = PatchFileInfo { + before_hash: info.before_hash.clone().unwrap_or_default(), + after_hash: info.after_hash.clone().unwrap_or_default(), + }; + if info.before_hash.is_empty() { + continue; // a new file has no baseline to compare + } + if verify_file_patch(&pkg.path, file, &info).await.status + == VerifyStatus::HashMismatch + { + mismatched.insert(patch.uuid.clone()); + break; + } + } + } + mismatched +} + /// Cross-reference an existing manifest against discovery results to find /// PURLs whose newest available patch UUID differs from the locally-recorded /// one. Used by both the discovery JSON path and the table-print path. @@ -1822,6 +1872,21 @@ pub async fn run(args: ScanArgs) -> i32 { return embed_vex_human(&args.common, &args.vex, &manifest_path, 0).await; } + // Vendor mode: pre-verify baselines so a content mismatch surfaces + // BEFORE the confirm prompt (vendoring still proceeds for these — + // the stage force-applies the verified patched content). 
+ let mismatched_baselines: HashSet = if args.vendor && !args.common.silent { + preverify_vendor_baselines( + &api_client, + effective_org_slug, + &selected, + &filtered_crawled, + ) + .await + } else { + HashSet::new() + }; + // Display detailed summary of selected patches before confirming // (presentational only — skipped wholesale under --silent). if !args.common.silent { @@ -1864,6 +1929,11 @@ pub async fn run(args: ScanArgs) -> i32 { patch.tier.to_uppercase(), sev_colored, ); + if mismatched_baselines.contains(&patch.uuid) { + println!( + " (installed content differs from patch baseline — will vendor patched content)" + ); + } if !vuln_ids.is_empty() { println!(" Fixes: {}", vuln_ids.join(", ")); } diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index f306f1d..94fd6e7 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -49,8 +49,12 @@ pub struct VendorArgs { #[command(flatten)] pub common: GlobalArgs, - /// Skip pre-vendor hash verification (vendor even if the installed - /// package's files differ from the patch's beforeHash). + /// Tolerate MISSING patch-target files in the staged copy (they are + /// skipped instead of failing the vendor) and bypass the variant + /// probe for multi-release ecosystems. A plain beforeHash mismatch + /// no longer needs this: vendor staging always overwrites mismatched + /// content with the verified patched bytes (surfaced as a + /// `vendor_content_mismatch_overwritten` warning). #[arg( short = 'f', long, @@ -586,16 +590,38 @@ pub(crate) async fn vendor_records( } let mut event = result_to_event(&result, common.dry_run); // The shared translator's in-sync classification reads - // `already_patched`; under `vendor` the contract tag is - // `already_vendored` (artifact + wiring already in sync). + // `already_patched`. 
Two distinct cases land there: + // + // * `entry` is None — the TRUE in-sync rerun (the backend + // synthesized AlreadyPatched and recorded nothing); + // under `vendor` the contract tag is `already_vendored`. + // * `entry` is Some — the FIRST vendor of a package + // already patched in place by `apply`: every file + // verified AlreadyPatched, but THIS run packed the + // artifact and rewired the lock. That is an Applied + // (`summary.applied` must count it), not a skip. if event.action == PatchAction::Skipped && event.error_code.as_deref() == Some("already_patched") { - event = PatchEvent::new(PatchAction::Skipped, candidate.clone()) - .with_reason( - "already_vendored", - "artifact and lockfile wiring already in sync", - ); + if entry.is_none() { + event = PatchEvent::new(PatchAction::Skipped, candidate.clone()) + .with_reason( + "already_vendored", + "artifact and lockfile wiring already in sync", + ); + } else { + let files = result + .files_verified + .iter() + .map(|f| crate::json_envelope::PatchEventFile { + path: f.file.clone(), + verified: true, + applied_via: None, + }) + .collect(); + event = PatchEvent::new(PatchAction::Applied, candidate.clone()) + .with_files(files); + } } env.record(event); for w in &warnings { diff --git a/crates/socket-patch-cli/tests/in_process_vendor.rs b/crates/socket-patch-cli/tests/in_process_vendor.rs index 0ef740c..791d9f9 100644 --- a/crates/socket-patch-cli/tests/in_process_vendor.rs +++ b/crates/socket-patch-cli/tests/in_process_vendor.rs @@ -1228,3 +1228,181 @@ fn json_envelope_shape() { assert_eq!(env["status"], "noManifest"); assert!(events(&env).is_empty()); } + +// ──────────────── vendor auto-force + already-applied lifecycle ──────────────── + +/// A package already patched IN PLACE by `apply` must vendor cleanly on the +/// first run — and the envelope must report it as `applied` (this run packed +/// the artifact and rewired the lock), NOT `skipped/already_vendored`. 
The +/// second run is the true in-sync rerun and reports `already_vendored`. +#[test] +fn vendor_after_in_place_apply_emits_applied_event() { + let fx = npm_fixture(); + // Simulate a prior in-place `socket-patch apply`. + std::fs::write(fx.installed_index(), PATCHED_INDEX).unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], PURL); + assert_eq!( + env["summary"]["applied"], 1, + "first vendor of an applied package counts as applied: {env:#}" + ); + assert!(fx.tgz_path().exists(), "artifact packed"); + assert!(fx.state_path().exists(), "ledger entry recorded"); + // No mismatch warning: afterHash content is AlreadyPatched, not divergent. + assert!( + !events(&env) + .iter() + .any(|e| e["errorCode"] == "vendor_content_mismatch_overwritten"), + "{env:#}" + ); + + // Second run: artifact + wiring already in sync. + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + find_event(&env, "skipped", Some("already_vendored")); + assert_eq!(env["summary"]["applied"], 0); +} + +/// Installed content matching NEITHER hash (a patch built against different +/// bytes than the installed artifact — the flatted@3.3.1 case) still vendors: +/// the stage is overwritten with the verified patched content, the run exits +/// 0 with an `applied` event, and the overwrite surfaces as a +/// `vendor_content_mismatch_overwritten` warning event. 
+#[test] +fn mismatched_baseline_vendors_with_warning_event() { + let fx = npm_fixture(); + std::fs::write( + fx.installed_index(), + b"module.exports = () => 'divergent';\n", + ) + .unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], PURL); + let warning = find_event(&env, "skipped", Some("vendor_content_mismatch_overwritten")); + assert!( + warning["reason"] + .as_str() + .unwrap_or("") + .contains("left-pad@1.3.0"), + "warning names the package: {env:#}" + ); + assert!(fx.tgz_path().exists(), "artifact packed despite the mismatch"); + // The installed tree keeps its divergent bytes (only the stage changed). + assert_eq!( + std::fs::read(fx.installed_index()).unwrap(), + b"module.exports = () => 'divergent';\n" + ); +} + +/// A patch-target file MISSING from the installed package still fails closed +/// (auto-force must not inherit `--force`'s silent NotFound skip — the +/// tarball would ship without the fix); `--force` keeps that tolerance. +#[test] +fn vendor_missing_file_fails_closed_without_force() { + let fx = npm_fixture(); + std::fs::remove_file(fx.installed_index()).unwrap(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_ne!(code, 0, "missing patch target must fail: {env:#}"); + let failed = find_event(&env, "failed", None); + assert!( + failed["error"] + .as_str() + .unwrap_or("") + .contains("File not found"), + "{env:#}" + ); + assert_eq!(fx.lock_bytes(), fx.original_lock, "lock byte-untouched"); + assert!(!fx.vendor_dir().exists(), "no artifacts on failure"); + + // --force: the missing file is tolerated (skipped) and the vendor lands. 
+ let fx2 = npm_fixture(); + std::fs::remove_file(fx2.installed_index()).unwrap(); + let (code, env) = vendor_cli(fx2.root(), &["--force"]); + assert_eq!(code, 0, "{env:#}"); +} + +// ──────────────── percent-encoded scoped purls (Fix A integration) ──────────────── + +/// Build a fixture whose installed package is the SCOPED `@scope/left-pad` +/// while the manifest keys the patch by the API's percent-encoded purl +/// (`pkg:npm/%40scope/left-pad@1.3.0`) — exactly what `scan` writes. +fn npm_scoped_fixture() -> NpmFixture { + let fx = npm_fixture_with_purls(&["pkg:npm/%40scope/left-pad@1.3.0"]); + let root = fx.root(); + + // Re-home the installed package under the scope dir. + let scoped = root.join("node_modules/@scope/left-pad"); + std::fs::create_dir_all(scoped.parent().unwrap()).unwrap(); + std::fs::rename(root.join("node_modules/left-pad"), &scoped).unwrap(); + std::fs::write( + scoped.join("package.json"), + br#"{"name":"@scope/left-pad","version":"1.3.0"}"#, + ) + .unwrap(); + + // Re-key the lock entry to the scoped install path. + let mut lock: Value = serde_json::from_slice(&fx.original_lock).unwrap(); + let packages = lock["packages"].as_object_mut().unwrap(); + let entry = packages.remove("node_modules/left-pad").unwrap(); + packages.insert("node_modules/@scope/left-pad".to_string(), entry); + lock["packages"][""]["dependencies"] = json!({ "@scope/left-pad": "^1.3.0" }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), &lock_bytes).unwrap(); + + fx +} + +/// The API serves scoped purls percent-encoded and `scan` stores them +/// verbatim as manifest keys; vendor must decode them to find the installed +/// `node_modules/@scope/...` package and wire the lock — while the ledger +/// stays keyed by the verbatim encoded purl (manifest parity). 
+#[test] +fn vendor_resolves_percent_encoded_scope_purl() { + let fx = npm_scoped_fixture(); + + let (code, env) = vendor_cli(fx.root(), &[]); + assert_eq!(code, 0, "{env:#}"); + let applied = find_event(&env, "applied", None); + assert_eq!(applied["purl"], "pkg:npm/%40scope/left-pad@1.3.0"); + + // Artifact lands under the DECODED scope dir. + let tgz = fx + .root() + .join(format!(".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz")); + assert!(tgz.exists(), "tarball at the decoded scoped path"); + + // Lock rewired to the vendored artifact. + let lock = fx.lock_value(); + assert_eq!( + lock["packages"]["node_modules/@scope/left-pad"]["resolved"], + json!(format!( + "file:.socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )) + ); + + // Ledger keyed by the VERBATIM encoded purl (manifest key parity). + let state: Value = + serde_json::from_slice(&std::fs::read(fx.state_path()).unwrap()).unwrap(); + assert!( + state["entries"]["pkg:npm/%40scope/left-pad@1.3.0"].is_object(), + "state keyed by the encoded manifest purl: {state:#}" + ); + + // Round-trip: revert restores the original (scoped) lock bytes. 
+ let (code, env) = vendor_cli(fx.root(), &["--revert"]); + assert_eq!(code, 0, "{env:#}"); + let lock = fx.lock_value(); + assert_eq!( + lock["packages"]["node_modules/@scope/left-pad"]["resolved"], + json!(REG_RESOLVED) + ); + assert!(!fx.vendor_dir().join("npm").exists(), "artifacts removed"); +} diff --git a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs index 1565d01..f1b3c51 100644 --- a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs @@ -136,6 +136,7 @@ pub async fn vendor_bun( sources, dry_run, force, + &mut warnings, ) .await { diff --git a/crates/socket-patch-core/src/patch/vendor/cargo.rs b/crates/socket-patch-core/src/patch/vendor/cargo.rs index 85ea5cb..614fd28 100644 --- a/crates/socket-patch-core/src/patch/vendor/cargo.rs +++ b/crates/socket-patch-core/src/patch/vendor/cargo.rs @@ -18,7 +18,7 @@ use std::path::{Path, PathBuf}; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, + normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::file_hash::compute_file_git_sha256; @@ -269,22 +269,27 @@ pub async fn vendor_cargo_crate( } if dry_run { - // Verify (read-only) against the pristine source — apply_package_patch - // never writes when dry_run — for an accurate "would patch" report, - // without creating the copy or editing config/lock. - let mut result = apply_package_patch( + // Verify (read-only) against the pristine source — the apply + // pipeline never writes when dry_run — for an accurate "would + // patch" report (including the auto-force overwrite warnings the + // real run would emit), without creating the copy or editing + // config/lock. 
+ let mut dry_warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, pristine_src, - &record.files, + record, sources, - Some(&record.uuid), true, force, + name, + version, + &mut dry_warnings, ) .await; result.package_path = copy_dir.display().to_string(); result.sidecar = None; - return done(result, None, Vec::new()); + return done(result, None, dry_warnings); } // Hot path: already in sync → touch nothing (entry stays with the caller's @@ -333,15 +338,19 @@ pub async fn vendor_cargo_crate( ); } - // Delegate to the hardened pipeline, pointed at the copy. - let mut result = apply_package_patch( + // Delegate to the hardened pipeline (vendor auto-force policy — see + // `force_apply_staged`), pointed at the copy. + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, &copy_dir, - &record.files, + record, sources, - Some(&record.uuid), false, force, + name, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -350,7 +359,7 @@ pub async fn vendor_cargo_crate( // Don't leave a half-built copy (or an empty uuid husk) that // verify/sweep would misjudge. let _ = remove_tree(&uuid_dir).await; - return done(result, None, Vec::new()); + return done(result, None, warnings); } // A path-dep copy must never carry a checksum sidecar.
The fresh copy @@ -370,10 +379,9 @@ pub async fn vendor_cargo_crate( let _ = remove_tree(&uuid_dir).await; result.success = false; result.error = Some(format!("failed to update .cargo/config.toml: {e}")); - return done(result, None, Vec::new()); + return done(result, None, warnings); } - let mut warnings = Vec::new(); let prior_path = prior_entry.as_ref().and_then(|i| i.path.clone()); if prior_path.as_deref().is_some_and(is_legacy_redirect_path) { warnings.push(VendorWarning::new( diff --git a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs index b2ae9cd..5a533eb 100644 --- a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs @@ -35,7 +35,7 @@ use serde_json::{json, Map, Value}; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; @@ -193,21 +193,24 @@ pub async fn vendor_composer( // ── dry run: verify-only against the installed dir, no writes ──────── if dry_run { - let mut result = apply_package_patch( + let mut dry_warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, installed_dir, - &record.files, + record, sources, - Some(&record.uuid), true, force, + &pkg, + version, + &mut dry_warnings, ) .await; result.package_path = copy_dir.display().to_string(); return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings: dry_warnings, }; } @@ -225,14 +228,17 @@ pub async fn vendor_composer( warnings: Vec::new(), }; } - let mut result = apply_package_patch( + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, &copy_dir, - &record.files, + record, sources, -
Some(&record.uuid), false, force, + &pkg, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -242,7 +248,7 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -256,7 +262,7 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; }; let rewritten = rewrite_lock_entry(original_obj, &copy_rel, &record.uuid); @@ -272,12 +278,11 @@ pub async fn vendor_composer( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } // ── marker + ledger entry ──────────────────────────────────────────── - let mut warnings = Vec::new(); let base_purl = build_composer_purl(&vendor, &name, version); let mut vulnerabilities: Vec = record.vulnerabilities.keys().cloned().collect(); vulnerabilities.sort(); diff --git a/crates/socket-patch-core/src/patch/vendor/gem.rs b/crates/socket-patch-core/src/patch/vendor/gem.rs index 5ce51a2..0eccace 100644 --- a/crates/socket-patch-core/src/patch/vendor/gem.rs +++ b/crates/socket-patch-core/src/patch/vendor/gem.rs @@ -53,7 +53,7 @@ use serde_json::Value; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; @@ -282,21 +282,24 @@ pub async fn vendor_gem( // ── dry run: verify-only against the installed dir, no writes ──────── if dry_run { - let mut result = apply_package_patch( + let mut dry_warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, installed_dir, - &record.files, + record, sources, - Some(&record.uuid), true, force, + name, + version, + &mut dry_warnings, ) .await; result.package_path =
copy_dir.display().to_string(); return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings: dry_warnings, }; } @@ -338,14 +341,17 @@ pub async fn vendor_gem( warnings: Vec::new(), }; } - let mut result = apply_package_patch( + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( purl, &copy_dir, - &record.files, + record, sources, - Some(&record.uuid), false, force, + name, + version, + &mut warnings, ) .await; result.package_path = copy_dir.display().to_string(); @@ -355,7 +361,7 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -368,7 +374,7 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } @@ -395,13 +401,12 @@ pub async fn vendor_gem( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } }; // ── marker + ledger entry ──────────────────────────────────────────── - let mut warnings = Vec::new(); let base_purl = build_gem_purl(name, version); let mut vulnerabilities: Vec = record.vulnerabilities.keys().cloned().collect(); vulnerabilities.sort(); diff --git a/crates/socket-patch-core/src/patch/vendor/golang.rs b/crates/socket-patch-core/src/patch/vendor/golang.rs index 8961482..066676c 100644 --- a/crates/socket-patch-core/src/patch/vendor/golang.rs +++ b/crates/socket-patch-core/src/patch/vendor/golang.rs @@ -101,6 +101,29 @@ pub async fn vendor_go_module( .is_some_and(|e| e.owner == Some(ReplaceOwner::GoPatches)); let prior_path = prior.as_ref().and_then(|e| e.path.clone()); + // Vendor auto-force policy (the engine's copy is staged from the + // pristine source, never the user's tree — see `force_apply_staged`): + // missing patch targets still fail closed unless the caller's own + // `--force` asked for the skip tolerance, then the engine apply runs + // forced so a beforeHash mismatch (already-applied module, or a patch + // built
against different bytes) overwrites with the verified patched + // content. The engine is shared with the in-place `apply` redirect + // path, whose strict semantics stay unchanged. + let mut warnings: Vec = Vec::new(); + if !force { + let missing = super::missing_existing_patch_files(pristine_src, &record.files).await; + if let Some(first) = missing.first() { + return VendorOutcome::Done { + result: super::failed_apply_result( + purl, + format!("Cannot apply patch: {first} - File not found"), + ), + entry: None, + warnings, + }; + } + } + // The engine does the heavy lifting: fresh copy → hardened apply pipeline // → `replace` upsert (which refuses a user-authored same-version pin). let result = apply_go_redirect( @@ -114,15 +137,18 @@ pub async fn vendor_go_module( sources, Some(&record.uuid), dry_run, - force, + /*force=*/ true, ) .await; + if result.success { + warnings.extend(super::mismatch_overwrite_warnings(&result, module, version)); + } if dry_run { return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } if !result.success { @@ -134,7 +160,7 @@ pub async fn vendor_go_module( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } // A patch with no files is a no-op success: the engine wrote no copy and @@ -143,12 +169,10 @@ pub async fn vendor_go_module( return VendorOutcome::Done { result, entry: None, - warnings: Vec::new(), + warnings, }; } - let mut warnings = Vec::new(); - if takeover { // The `replace` line was already atomically repointed by the upsert; // the apply backend's copy is now unreachable — delete it (built from diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 7d60fdc..1aa70cc 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -75,7 +75,14 @@ pub mod yarn_classic_lock; pub use path::{ecosystem_dir_for_purl, parse_vendor_path, 
VendorPathParts, VENDOR_DIR}; pub use state::{load_state, save_state, VendorEntry, VendorState, VENDOR_STATE_REL}; -use crate::patch::apply::ApplyResult; +use std::collections::HashMap; +use std::path::Path; + +use crate::manifest::schema::{PatchFileInfo, PatchRecord}; +use crate::patch::apply::{ + apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + VerifyStatus, +}; /// A non-fatal advisory surfaced as a warning event (`code` is a stable /// reason tag from the CLI contract; `detail` is human text). @@ -94,6 +101,141 @@ impl VendorWarning { } } +/// One warning per staged file whose pre-patch content matched NEITHER +/// `beforeHash` nor `afterHash` and was overwritten with the verified +/// patched content (vendor staging always force-applies — the stage is a +/// private copy, and every apply write path is hash-gated to exactly +/// `afterHash`). +/// +/// Detection rides the verify signature `apply_package_patch` leaves +/// behind: a force-promoted file keeps `status: Ready` WITH +/// `expected_hash: Some(..)` and a differing `current_hash`, whereas a +/// cleanly-verified file carries `expected_hash: None` (see +/// `verify_file_patch`). +pub(crate) fn mismatch_overwrite_warnings( + result: &ApplyResult, + name: &str, + version: &str, +) -> Vec { + let mut warnings: Vec = result + .files_verified + .iter() + .filter(|v| { + v.status == VerifyStatus::Ready + && v.expected_hash.is_some() + && v.current_hash != v.expected_hash + }) + .map(|v| { + VendorWarning::new( + "vendor_content_mismatch_overwritten", + format!( + "installed {name}@{version} does not match this patch's expected original \ + ({}); vendored the patched content anyway", + v.file + ), + ) + }) + .collect(); + // HashMap-driven verify order is randomized; keep warning order stable. + warnings.sort_by(|a, b| a.detail.cmp(&b.detail)); + warnings +} + +/// Patch-target files (non-empty `beforeHash`) absent from the staged +/// copy. 
Vendor staging force-applies (see [`force_apply_staged`]), and +/// force silently SKIPS missing files — which would pack an artifact +/// without the fix. This pre-check restores the strict apply's +/// fail-closed behavior for the non-`--force` path. Unsafe keys are +/// skipped here: the apply pipeline itself rejects them fail-closed. +pub(crate) async fn missing_existing_patch_files( + staged_dir: &Path, + files: &HashMap, +) -> Vec { + let mut missing: Vec = Vec::new(); + for (file_name, info) in files { + if info.before_hash.is_empty() { + continue; // a new file is expected to not exist yet + } + let normalized = normalize_file_path(file_name); + if !is_safe_relative_subpath(normalized) { + continue; + } + if tokio::fs::metadata(staged_dir.join(normalized)).await.is_err() { + missing.push(file_name.clone()); + } + } + missing.sort(); + missing +} + +/// A failed synthesized [`ApplyResult`] in the shape the strict apply +/// pipeline would have produced (success=false, `error` set, no files). +pub(crate) fn failed_apply_result(purl: &str, error: String) -> ApplyResult { + ApplyResult { + package_key: purl.to_string(), + package_path: String::new(), + success: false, + files_verified: Vec::new(), + files_patched: Vec::new(), + applied_via: HashMap::new(), + error: Some(error), + sidecar: None, + } +} + +/// Run the hardened apply pipeline against a vendor stage/copy with the +/// vendor auto-force policy: +/// +/// * Missing patch-target files fail closed unless the caller's own +/// `--force` asked for that skip tolerance. +/// * The apply itself ALWAYS forces: the stage is a private copy (never +/// the user's tree), and every apply write path is hash-gated to +/// exactly `afterHash` (the archive and blob paths verify content +/// BEFORE writing; the diff path self-disables on a base mismatch) — +/// forcing can only produce the verified patched content or fail +/// closed. 
This is what lets vendor succeed on a package already +/// patched in place by `apply`, or on a patch whose `beforeHash` was +/// built against different bytes than the installed artifact. +/// * Every force-overwritten file (content matched NEITHER hash) emits a +/// `vendor_content_mismatch_overwritten` warning — including on dry +/// runs, so previews predict the real outcome. +#[allow(clippy::too_many_arguments)] +pub(crate) async fn force_apply_staged( + purl: &str, + staged_dir: &Path, + record: &PatchRecord, + sources: &PatchSources<'_>, + dry_run: bool, + force: bool, + name: &str, + version: &str, + warnings: &mut Vec, +) -> ApplyResult { + if !force { + let missing = missing_existing_patch_files(staged_dir, &record.files).await; + if let Some(first) = missing.first() { + return failed_apply_result( + purl, + format!("Cannot apply patch: {first} - File not found"), + ); + } + } + let result = apply_package_patch( + purl, + staged_dir, + &record.files, + sources, + Some(&record.uuid), + dry_run, + /*force=*/ true, + ) + .await; + if result.success { + warnings.extend(mismatch_overwrite_warnings(&result, name, version)); + } + result +} + /// The result of one backend `vendor_*` call. 
// // `large_enum_variant`: `Done` is much bigger than `Refused` because it carries @@ -187,3 +329,63 @@ pub async fn vendored_purl_keys( Err(_) => std::collections::HashSet::new(), } } + +#[cfg(test)] +mod policy_tests { + use super::*; + use crate::patch::apply::VerifyResult; + + fn verify(status: VerifyStatus, expected: Option<&str>, current: Option<&str>) -> VerifyResult { + VerifyResult { + file: "package/index.js".to_string(), + status, + message: None, + current_hash: current.map(str::to_string), + expected_hash: expected.map(str::to_string), + target_hash: None, + } + } + + fn result_with(files_verified: Vec) -> ApplyResult { + ApplyResult { + package_key: "pkg:npm/x@1.0.0".to_string(), + package_path: String::new(), + success: true, + files_verified, + files_patched: Vec::new(), + applied_via: HashMap::new(), + error: None, + sidecar: None, + } + } + + /// Only the force-promoted signature (`Ready` + `expected_hash: Some` + + /// differing `current_hash`) flags an overwrite; clean verifies and + /// AlreadyPatched files never do. + #[test] + fn mismatch_overwrite_warnings_detects_promoted_ready() { + // Force-promoted mismatch: flagged. + let r = result_with(vec![verify(VerifyStatus::Ready, Some("aa"), Some("bb"))]); + let w = mismatch_overwrite_warnings(&r, "left-pad", "1.3.0"); + assert_eq!(w.len(), 1); + assert_eq!(w[0].code, "vendor_content_mismatch_overwritten"); + assert!(w[0].detail.contains("left-pad@1.3.0")); + assert!(w[0].detail.contains("package/index.js")); + + // Clean Ready (verify matched beforeHash): expected_hash is None. + let r = result_with(vec![verify(VerifyStatus::Ready, None, Some("aa"))]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + + // AlreadyPatched (afterHash content): not a mismatch. + let r = result_with(vec![verify( + VerifyStatus::AlreadyPatched, + None, + Some("after"), + )]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + + // NotFound (force-skipped): not an overwrite. 
+ let r = result_with(vec![verify(VerifyStatus::NotFound, None, None)]); + assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/npm_common.rs b/crates/socket-patch-core/src/patch/vendor/npm_common.rs index 1cfc62d..f54a893 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_common.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_common.rs @@ -12,20 +12,19 @@ //! the project byte-untouched (a dry run stops after verification and //! creates nothing on disk). -use std::collections::HashMap; use std::path::Path; use serde_json::Value; use crate::manifest::schema::PatchRecord; -use crate::patch::apply::{apply_package_patch, normalize_file_path, ApplyResult, PatchSources}; +use crate::patch::apply::{normalize_file_path, ApplyResult, PatchSources}; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::path_safety; use crate::utils::purl::{percent_decode_purl_component, strip_purl_qualifiers}; use super::npm_pack::{pack_deterministic, PackedTarball}; use super::path::vendor_uuid_dir_rel; -use super::VendorOutcome; +use super::{VendorOutcome, VendorWarning}; /// Validated npm vendoring coordinates (the output of /// [`guard_coordinates`]). 
`name`/`version` are the percent-DECODED purl @@ -130,6 +129,7 @@ pub(super) async fn stage_patch_pack( sources: &PatchSources<'_>, dry_run: bool, force: bool, + warnings: &mut Vec, ) -> Result<(Option, ApplyResult), Box> { let coords = guard_coordinates(purl, record)?; @@ -179,18 +179,21 @@ pub(super) async fn stage_patch_pack( } } - // Delegate to the hardened apply pipeline, pointed at the stage (which + // Delegate to the hardened apply pipeline (with the vendor auto-force + // policy — see `force_apply_staged`), pointed at the stage (which // plays the role of the installed package dir — manifest npm keys carry // the `package/` prefix and `apply` strips it via `normalize_file_path`, // exactly as it does for an in-place npm apply). - let result = apply_package_patch( + let result = super::force_apply_staged( purl, &stage, - &record.files, + record, sources, - Some(&record.uuid), dry_run, force, + &coords.name, + &coords.version, + warnings, ) .await; // A failed patch never packs (wiring is last — the caller returns with @@ -331,16 +334,7 @@ pub(super) fn refused(code: &'static str, detail: String) -> VendorOutcome { /// results. 
pub(super) fn done_failure(purl: &str, error: String) -> VendorOutcome { VendorOutcome::Done { - result: ApplyResult { - package_key: purl.to_string(), - package_path: String::new(), - success: false, - files_verified: Vec::new(), - files_patched: Vec::new(), - applied_via: HashMap::new(), - error: Some(error), - sidecar: None, - }, + result: super::failed_apply_result(purl, error), entry: None, warnings: Vec::new(), } @@ -350,6 +344,7 @@ pub(super) fn done_failure(purl: &str, error: String) -> VendorOutcome { mod tests { use super::*; use crate::manifest::schema::PatchFileInfo; + use std::collections::HashMap; const UUID: &str = "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"; diff --git a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs index bed244b..7e23591 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs @@ -175,6 +175,7 @@ pub async fn vendor_npm( sources, dry_run, force, + &mut warnings, ) .await { @@ -1090,6 +1091,176 @@ mod tests { assert!(found, "package/index.js missing from the tarball"); } + /// Read one member's bytes out of the packed tarball. + fn tgz_member(tgz: &[u8], member: &str) -> Option> { + let mut archive = tar::Archive::new(flate2::read::GzDecoder::new(tgz)); + for e in archive.entries().unwrap() { + let mut e = e.unwrap(); + if e.path().unwrap().to_string_lossy() == member { + let mut data = Vec::new(); + std::io::Read::read_to_end(&mut e, &mut data).unwrap(); + return Some(data); + } + } + None + } + + /// Vendor auto-force policy: installed content matching NEITHER hash + /// (e.g. a patch built against different bytes than the registry + /// artifact) is overwritten in the STAGE with the verified patched + /// content; the run succeeds, wires the lock, and surfaces the + /// overwrite as a `vendor_content_mismatch_overwritten` warning. The + /// installed tree is never touched. 
+ #[tokio::test] + async fn vendor_overwrites_mismatched_content_with_warning() { + let fx = fixture().await; + let divergent: &[u8] = b"module.exports = () => 'divergent';\n"; + tokio::fs::write(fx.installed().join("index.js"), divergent) + .await + .unwrap(); + + let (result, entry, warnings) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some(), "first vendor records a ledger entry"); + assert_eq!( + warnings + .iter() + .filter(|w| w.code == "vendor_content_mismatch_overwritten") + .count(), + 1, + "overwrite surfaced exactly once: {warnings:?}" + ); + assert!( + warnings[0].detail.contains("left-pad@1.3.0") + && warnings[0].detail.contains("package/index.js"), + "warning names the package and file: {warnings:?}" + ); + + // The tarball carries the VERIFIED patched bytes, not the divergent + // ones — every apply write path is hash-gated to afterHash. + let tgz = tokio::fs::read(fx.root().join(fx.expected_rel_tgz())) + .await + .unwrap(); + assert_eq!( + tgz_member(&tgz, "package/index.js").unwrap(), + PATCHED_INDEX + ); + + // The installed tree keeps its (divergent) bytes — only the stage + // was overwritten. + assert_eq!( + tokio::fs::read(fx.installed().join("index.js")) + .await + .unwrap(), + divergent + ); + + // The lock was rewired to the vendored artifact. + let lock = fx.read_lock().await; + assert_eq!( + lock["packages"]["node_modules/left-pad"]["resolved"], + json!(format!("file:{}", fx.expected_rel_tgz())) + ); + } + + /// Auto-force must NOT inherit force's silent NotFound skip: a missing + /// patch-target file still fails closed (a tarball without the fix + /// must never be packed), leaving the project byte-untouched. 
+ #[tokio::test] + async fn vendor_missing_patch_file_fails_without_force() { + let fx = fixture().await; + tokio::fs::remove_file(fx.installed().join("index.js")) + .await + .unwrap(); + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(!result.success, "missing file must fail closed"); + assert!( + result + .error + .as_deref() + .unwrap_or("") + .contains("File not found"), + "error names the missing file: {:?}", + result.error + ); + assert!(entry.is_none()); + assert_eq!( + tokio::fs::read(fx.lock_path()).await.unwrap(), + fx.lock_bytes, + "lock byte-untouched on failure" + ); + assert!( + tokio::fs::metadata(fx.root().join(".socket/vendor")) + .await + .is_err(), + "no artifact dir on failure" + ); + } + + /// `vendor --force` keeps its missing-file tolerance (strict superset + /// of the auto-force policy). + #[tokio::test] + async fn vendor_force_still_skips_missing_files() { + let fx = fixture().await; + tokio::fs::remove_file(fx.installed().join("index.js")) + .await + .unwrap(); + + let blobs = fx.root().join(".socket/blobs"); + let sources = PatchSources::blobs_only(&blobs); + let outcome = vendor_npm( + &fx.purl(), + &fx.installed(), + fx.root(), + &fx.record, + &sources, + "2026-06-09T00:00:00Z", + false, + /*force=*/ true, + ) + .await; + let (result, entry, _) = expect_done(outcome); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some()); + } + + /// A package already patched IN PLACE by `apply` vendors cleanly: the + /// staged copy verifies AlreadyPatched (no mismatch warning — the + /// content is exactly the patch's afterHash) and the tarball ships the + /// patched bytes. + #[tokio::test] + async fn vendor_of_already_applied_package_succeeds() { + let fx = fixture().await; + // Simulate a prior in-place `socket-patch apply`. 
+ tokio::fs::write(fx.installed().join("index.js"), PATCHED_INDEX) + .await + .unwrap(); + + let (result, entry, warnings) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some(), "first vendor records a ledger entry"); + assert!( + warnings + .iter() + .all(|w| w.code != "vendor_content_mismatch_overwritten"), + "afterHash content is AlreadyPatched, not a mismatch: {warnings:?}" + ); + + let tgz = tokio::fs::read(fx.root().join(fx.expected_rel_tgz())) + .await + .unwrap(); + assert_eq!( + tgz_member(&tgz, "package/index.js").unwrap(), + PATCHED_INDEX + ); + let lock = fx.read_lock().await; + assert_eq!( + lock["packages"]["node_modules/left-pad"]["resolved"], + json!(format!("file:{}", fx.expected_rel_tgz())) + ); + } + #[tokio::test] async fn rerun_is_in_sync_and_byte_stable() { let fx = fixture().await; diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index 48ec98c..e2ef0af 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -163,6 +163,7 @@ pub async fn vendor_pnpm( sources, dry_run, force, + &mut warnings, ) .await { diff --git a/crates/socket-patch-core/src/patch/vendor/pypi.rs b/crates/socket-patch-core/src/patch/vendor/pypi.rs index 553a0dc..8b317f3 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi.rs @@ -405,6 +405,7 @@ pub async fn vendor_pypi( &dest, dry_run, force, + &mut warnings, ) .await; let (result, artifact) = match built { diff --git a/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs b/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs index 69d4ffb..4b851ff 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi_wheel.rs @@ -20,7 +20,7 @@ use sha2::Digest as _; use 
crate::crawlers::python_crawler::{canonicalize_pypi_name, read_python_metadata}; use crate::manifest::schema::PatchRecord; use crate::patch::apply::{ - apply_package_patch, is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, }; use crate::utils::fs::{atomic_write_bytes, list_dir_entries}; @@ -255,6 +255,7 @@ pub async fn build_patched_wheel( dest: &Path, dry_run: bool, force: bool, + warnings: &mut Vec, ) -> Result<(ApplyResult, Option), (&'static str, String)> { // Editable installs (`pip install -e` / uv tool dev mode) point // site-packages at the user's own working tree: the RECORD describes a @@ -371,15 +372,18 @@ pub async fn build_patched_wheel( } // Patch the stage through the shared apply pipeline (same verify/source - // strategy contract as `apply`). The installed tree is never touched. - let mut result = apply_package_patch( + // strategy contract as `apply`, with the vendor auto-force policy — + // see `force_apply_staged`). The installed tree is never touched. 
+ let mut result = super::force_apply_staged( purl, stage.path(), - &record.files, + record, sources, - Some(&record.uuid), dry_run, force, + &dist.dist_name, + &dist.version, + warnings, ) .await; if dry_run || !result.success { @@ -903,6 +907,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -952,6 +957,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap_err(); @@ -977,6 +983,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -994,6 +1001,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1044,6 +1052,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1082,6 +1091,7 @@ mod tests { &fx.dest, false, false, + &mut Vec::new(), ) .await .unwrap_err(); @@ -1105,6 +1115,7 @@ mod tests { &fx.dest, true, false, + &mut Vec::new(), ) .await .unwrap(); @@ -1120,8 +1131,12 @@ mod tests { ); } + /// Vendor auto-force policy: installed content matching NEITHER hash is + /// overwritten with the verified patched content in the STAGE (the + /// installed tree is never touched), and the overwrite is surfaced as a + /// `vendor_content_mismatch_overwritten` warning. #[tokio::test] - async fn hash_mismatch_fails_without_touching_install_or_dest() { + async fn hash_mismatch_overwrites_in_stage_with_warning() { let fx = make_fixture("", None).await; // Corrupt the installed six.py so verify sees a HashMismatch. 
tokio::fs::write(fx.site_packages.join("six.py"), b"tampered") @@ -1132,6 +1147,7 @@ mod tests { .unwrap(); let record = patch_record(&[("six.py", ORIG, PATCHED)]); let sources = PatchSources::blobs_only(&fx.blobs); + let mut warnings = Vec::new(); let (result, artifact) = build_patched_wheel( "pkg:pypi/six@1.16.0", &fx.site_packages, @@ -1141,10 +1157,65 @@ mod tests { &fx.dest, false, false, + &mut warnings, + ) + .await + .unwrap(); + assert!(result.success, "{:?}", result.error); + assert!(artifact.is_some()); + assert!(fx.dest.exists(), "patched wheel must be written"); + assert_eq!( + warnings + .iter() + .filter(|w| w.code == "vendor_content_mismatch_overwritten") + .count(), + 1, + "overwrite surfaced as a warning: {warnings:?}" + ); + // Installed tree untouched — only the stage was overwritten. + assert_eq!( + tokio::fs::read(fx.site_packages.join("six.py")) + .await + .unwrap(), + b"tampered" + ); + } + + /// A patch-target file MISSING from the install still fails closed + /// without `--force` — auto-force must not inherit force's silent + /// NotFound skip (the wheel would ship without the fix). + #[tokio::test] + async fn missing_patch_file_fails_without_force() { + let fx = make_fixture("", None).await; + tokio::fs::remove_file(fx.site_packages.join("six.py")) + .await + .unwrap(); + let dist = locate_installed_dist(&fx.site_packages, "six", "1.16.0") + .await + .unwrap(); + let record = patch_record(&[("six.py", ORIG, PATCHED)]); + let sources = PatchSources::blobs_only(&fx.blobs); + let (result, artifact) = build_patched_wheel( + "pkg:pypi/six@1.16.0", + &fx.site_packages, + &dist, + &record, + &sources, + &fx.dest, + false, + false, + &mut Vec::new(), ) .await .unwrap(); assert!(!result.success); + // The RECORD staging step trips first ("RECORD member ... is + // unreadable") — either way the build fails closed rather than + // packing a wheel without the fix. 
+ assert!( + result.error.is_some(), + "missing file fails closed with an error" + ); assert!(artifact.is_none()); assert!(!fx.dest.exists()); } diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs index d24c7c8..b7fa943 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs @@ -254,6 +254,7 @@ pub async fn vendor_yarn_berry( sources, dry_run, force, + &mut warnings, ) .await { diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs index 8d82a00..fb25126 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs @@ -135,6 +135,7 @@ pub async fn vendor_yarn_classic( sources, dry_run, force, + &mut warnings, ) .await { From 7363c65ec3b85f441661ac61517a60761a69ed7d Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 15:42:34 -0400 Subject: [PATCH 03/19] feat(vendor): take over exact-version override pins (pnpm + yarn berry) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A user-authored override/resolution that pins the package to exactly the version being vendored (Flowise: pnpm.overrides 'tar-fs': '3.1.0') no longer refuses with vendor_override_conflict. The pin's key is kept (its spelling and quoting preserved on both pnpm surfaces — pnpm hard-requires the package.json and lock override maps to agree), its VALUE is rewritten to the file:.socket/vendor/... spec, and the pinned value is recorded as the wiring original so every revert path (--revert, reconcile, remove) restores the user's pin verbatim. 
- pnpm: classify_pkg_override (Insert / Ours / Takeover) replaces the boolean conflict checks; effective key threads through EditCtx, apply_pkg_override and edit_overrides; revert restores originals in place instead of deleting. Ranges, different versions, parent>child selector chains, and duplicate same-name keys still refuse, now with a hint that exact pins are taken over. - yarn berry: bare-name resolutions pin equal to the version is taken over symmetrically (KIND_RESOLUTION records the original). - npm/yarn-classic/bun wire the lock only (no override surface), so no conflict exists there to take over. Co-Authored-By: Claude Fable 5 --- .../src/patch/vendor/pnpm_lock.rs | 398 ++++++++++++++++-- .../src/patch/vendor/yarn_berry_lock.rs | 103 ++++- 2 files changed, 445 insertions(+), 56 deletions(-) diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index e2ef0af..433f3b2 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -138,10 +138,15 @@ pub async fn vendor_pnpm( let mut lines = split_lines(&lock_text); // ── 3. Pre-flight refusals (override conflicts, entry present) ─────── - if let Err(detail) = check_pkg_override_conflict(&pkg, name, &override_key) { - return refused("vendor_override_conflict", detail); - } - if let Err(detail) = check_lock_override_conflict(&lines, name, &override_key) { + // A user-authored exact-version pin equal to `version` is TAKEN OVER + // (the pin's key is rewritten to our spec on both surfaces and the + // original value recorded for revert); anything else same-name refuses. 
+ let disposition = match classify_pkg_override(&pkg, name, version, &override_key) { + Ok(d) => d, + Err(detail) => return refused("vendor_override_conflict", detail), + }; + let effective_key = disposition.effective_key(&override_key).to_string(); + if let Err(detail) = check_lock_override(&lines, name, version, &effective_key) { return refused("vendor_override_conflict", detail); } if !lock_has_target_package(&lines, name, version) { @@ -201,11 +206,12 @@ pub async fn vendor_pnpm( rel_tgz: &rel_tgz, spec: &spec, integrity: &packed.integrity, + override_key: &effective_key, }; let mut wiring: Vec = Vec::new(); let (pkg_changed, created_pnpm_table, created_overrides_table) = - match apply_pkg_override(&mut pkg, &override_key, &spec, &mut wiring) { + match apply_pkg_override(&mut pkg, &effective_key, &spec, &mut wiring) { Ok(out) => out, Err(e) => return done_failure(purl, e), }; @@ -486,6 +492,11 @@ struct EditCtx<'a> { spec: &'a str, /// `sha512-` of the packed tarball. integrity: &'a str, + /// The override key BOTH surfaces edit (see + /// [`OverrideDisposition::effective_key`]): our canonical + /// `name@version` on a fresh insert, or the user's existing key on a + /// takeover / re-run over a taken-over key. + override_key: &'a str, } impl EditCtx<'_> { @@ -561,46 +572,129 @@ fn override_key_name(key: &str) -> &str { } } -/// Is this (key, value) override pair OURS for the target package — the -/// exact versioned selector pointing into `.socket/vendor/npm/`? -fn override_is_ours(key: &str, value: &str, our_key: &str) -> bool { - key == our_key && parse_vendor_path(value).is_some_and(|p| p.eco == "npm") +/// Does `value` point into `.socket/vendor/npm/` (ours — any uuid)? +fn is_vendor_value(value: &str) -> bool { + parse_vendor_path(value).is_some_and(|p| p.eco == "npm") +} + +/// How the package.json `pnpm.overrides` table relates to the package +/// being vendored. 
The lock's `overrides:` section must mirror this map +/// key-for-key (pnpm hard-checks the two and fails +/// `ERR_PNPM_LOCKFILE_CONFIG_MISMATCH` on any drift), so whichever key +/// this classification yields is the one BOTH surfaces edit. +#[derive(Debug, Clone, PartialEq, Eq)] +enum OverrideDisposition { + /// No same-name key: insert our canonical `name@version` key. + Insert, + /// A same-name key already points into `.socket/vendor/npm/` — ours + /// (any uuid; possibly a user key an earlier vendor took over). + /// Rewrite that key's value in place; our own value is never + /// recorded as an `original`. + Ours { key: String }, + /// A user-authored exact-version pin equal to the version being + /// vendored (`"tar-fs": "3.1.0"` or `"tar-fs@3.1.0": "3.1.0"`): take + /// the key over — rewrite its VALUE to the `file:` spec (the user's + /// pin already forces every `tar-fs` to this exact version, so + /// redirecting the same key preserves their semantics) and record + /// the pin as the wiring `original` so revert restores it exactly. + Takeover { key: String, original: String }, +} + +impl OverrideDisposition { + /// The override key both surfaces edit: the matched existing key, or + /// our canonical `name@version` on a fresh insert. + fn effective_key<'a>(&'a self, our_key: &'a str) -> &'a str { + match self { + OverrideDisposition::Insert => our_key, + OverrideDisposition::Ours { key } | OverrideDisposition::Takeover { key, .. } => key, + } + } } -/// A user-authored override already steering this package would be -/// silently fought over by ours; refuse instead (fail-closed). -fn check_pkg_override_conflict(pkg: &Value, name: &str, our_key: &str) -> Result<(), String> { +/// Classify the package.json override state for `name` (see +/// [`OverrideDisposition`]). 
`Err` is a genuine conflict (fail-closed): +/// a range/different-version value, a `parent>child` selector chain +/// (scoped to one dependent — our whole-graph rewrite has different +/// semantics), a non-string value, or several same-name keys. +fn classify_pkg_override( + pkg: &Value, + name: &str, + version: &str, + our_key: &str, +) -> Result { let Some(overrides) = pkg.get("pnpm").and_then(|p| p.get("overrides")) else { - return Ok(()); + return Ok(OverrideDisposition::Insert); }; let Some(map) = overrides.as_object() else { return Err("package.json pnpm.overrides is not an object".to_string()); }; + let mut found: Option = None; for (key, value) in map { if override_key_name(key) != name { continue; } + if found.is_some() { + return Err(format!( + "package.json carries more than one pnpm override for `{name}`; vendoring \ + cannot pick one — remove the extras first" + )); + } let value_str = value.as_str().unwrap_or(""); - if override_is_ours(key, value_str, our_key) { - continue; // ours (possibly a stale uuid) — the edit handles it + let classified = if key.contains('>') { + None + } else if is_vendor_value(value_str) { + Some(OverrideDisposition::Ours { key: key.clone() }) + } else if value_str == version && (key == name || key == our_key) { + Some(OverrideDisposition::Takeover { + key: key.clone(), + original: value_str.to_string(), + }) + } else { + None + }; + match classified { + Some(d) => found = Some(d), + None => { + return Err(format!( + "package.json already carries a pnpm override for `{key}` ({value}); \ + vendoring would fight it — remove the override (or vendor --revert) \ + first (an exact-version pin equal to {version} is taken over \ + automatically)" + )) + } } - return Err(format!( - "package.json already carries a pnpm override for `{key}` ({value}); vendoring \ - would fight it — remove the override (or vendor --revert) first" - )); } - Ok(()) + Ok(found.unwrap_or(OverrideDisposition::Insert)) } -/// Same conflict check against the 
lock's own `overrides:` section (a -/// desynced lock-side override would be silently clobbered otherwise). -fn check_lock_override_conflict(lines: &[String], name: &str, our_key: &str) -> Result<(), String> { +/// Lock-side mirror check against the effective key. Every same-name key +/// in the lock's `overrides:` section must BE `effective_key` (pnpm +/// requires the lock's override map to equal package.json's — a key-shape +/// drift means the pair is already desynced) with a value the edit can +/// own: ours, the exact pinned `version` (takeover), or already our spec. +/// A missing section/key is fine — the edit inserts it, restoring parity. +fn check_lock_override( + lines: &[String], + name: &str, + version: &str, + effective_key: &str, +) -> Result<(), String> { let Some((start, end)) = section_bounds(lines, "overrides") else { return Ok(()); }; for line in &lines[start + 1..end] { if let Some((key, _repr, rest)) = parse_key_line(line, 2) { - if override_key_name(&key) == name && !override_is_ours(&key, &rest, our_key) { + if override_key_name(&key) != name { + continue; + } + if key != effective_key { + return Err(format!( + "{PNPM_LOCK} carries an override key `{key}` for `{name}` that does not \ + match package.json's `{effective_key}` — the two override maps must \ + agree (run `pnpm install` to re-sync them) before vendoring" + )); + } + if !(is_vendor_value(&rest) || rest == version) { return Err(format!( "{PNPM_LOCK} already carries an override for `{key}` ({rest}); vendoring \ would fight it — remove the override (or vendor --revert) first" @@ -666,20 +760,24 @@ fn apply_pkg_override( if existing == Some(spec) { return Ok((false, false, false)); // in sync, no record } - // The conflict pre-flight guarantees any existing value here is OURS - // (a stale uuid): never record our own edit as the "original". 
- let was_ours = existing.is_some(); + // The classify pre-flight guarantees an existing value here is either + // OURS (a stale uuid — never recorded as an "original") or the user's + // exact-version pin being TAKEN OVER (recorded so revert restores it). + let was_present = existing.is_some(); + let original = existing + .filter(|v| !is_vendor_value(v)) + .map(|v| Value::String(v.to_string())); overrides.insert(our_key.to_string(), Value::String(spec.to_string())); wiring.push(WiringRecord { file: PACKAGE_JSON.to_string(), kind: KIND_PKG_OVERRIDE.to_string(), - action: if was_ours { + action: if was_present { WiringAction::Rewritten } else { WiringAction::Added }, key: Some(our_key.to_string()), - original: None, // Added has none; Rewritten-over-ours records none by design + original, new: Some(Value::String(spec.to_string())), }); Ok((true, created_pnpm_table, created_overrides_table)) @@ -695,7 +793,7 @@ fn edit_overrides( ctx: &EditCtx<'_>, wiring: &mut Vec, ) -> Result { - let our_key = ctx.reg_key(); + let our_key = ctx.override_key.to_string(); let entry_line = format!(" {}: {}", yaml_key(&our_key), ctx.spec); if let Some((start, end)) = section_bounds(lines, "overrides") { // Immutable scan first: our line's position (if present) + the last @@ -703,29 +801,38 @@ fn edit_overrides( let mut ours = None; let mut last_entry = start; for (i, line) in lines.iter().enumerate().take(end).skip(start + 1) { - if let Some((key, _repr, rest)) = parse_key_line(line, 2) { + if let Some((key, repr, rest)) = parse_key_line(line, 2) { last_entry = i; if key == our_key { - ours = Some((i, rest)); + ours = Some((i, repr, rest)); break; } } } - if let Some((i, rest)) = ours { + if let Some((i, repr, rest)) = ours { if rest == ctx.spec { return Ok(false); // in sync } - // Ours with a stale uuid (conflict pre-flight proved it). 
- lines[i] = entry_line; + // Ours with a stale uuid (no original), or the user's pinned + // value being TAKEN OVER (recorded as original; the live key + // repr/quoting is preserved so revert is byte-faithful). + let original = (!is_vendor_value(&rest)).then(|| rest.clone()); + lines[i] = format!(" {}: {}", yaml_key_like(&our_key, &repr), ctx.spec); wiring.push(overrides_record( &our_key, ctx.spec, WiringAction::Rewritten, + original, )); return Ok(true); } lines.insert(last_entry + 1, entry_line); - wiring.push(overrides_record(&our_key, ctx.spec, WiringAction::Added)); + wiring.push(overrides_record( + &our_key, + ctx.spec, + WiringAction::Added, + None, + )); return Ok(true); } // No overrides section: insert one right before `importers:` (with the @@ -736,17 +843,29 @@ fn edit_overrides( importers..importers, ["overrides:".to_string(), entry_line, String::new()], ); - wiring.push(overrides_record(&our_key, ctx.spec, WiringAction::Added)); + wiring.push(overrides_record( + &our_key, + ctx.spec, + WiringAction::Added, + None, + )); Ok(true) } -fn overrides_record(key: &str, spec: &str, action: WiringAction) -> WiringRecord { +fn overrides_record( + key: &str, + spec: &str, + action: WiringAction, + original: Option, +) -> WiringRecord { WiringRecord { file: PNPM_LOCK.to_string(), kind: KIND_LOCK_OVERRIDES.to_string(), action, key: Some(key.to_string()), - original: None, // Added, or rewritten-over-ours (never an original) + // `Some` only on a takeover (the user's pinned value); Added and + // rewritten-over-ours never record an original. + original: original.map(Value::String), new: Some(Value::String(spec.to_string())), } } @@ -1063,7 +1182,17 @@ fn revert_pkg_record( ))); return; } - overrides.shift_remove(key); + // A takeover recorded the user's pinned value as `original`: restore + // it in place (the key stays). A plain Added/Rewritten-over-ours + // record has no original — remove the key as before. 
+ match rec.original.as_ref().and_then(Value::as_str) { + Some(orig) => { + overrides.insert(key.to_string(), Value::String(orig.to_string())); + } + None => { + overrides.shift_remove(key); + } + } *dirty = true; } @@ -1113,15 +1242,15 @@ fn revert_overrides_line( let mut ours_at = None; let mut others = 0usize; for (i, line) in lines.iter().enumerate().take(end).skip(start + 1) { - if let Some((k, _repr, rest)) = parse_key_line(line, 2) { + if let Some((k, repr, rest)) = parse_key_line(line, 2) { if k == key && ours_at.is_none() { - ours_at = Some((i, rest)); + ours_at = Some((i, repr, rest)); } else { others += 1; } } } - let Some((idx, rest)) = ours_at else { + let Some((idx, repr, rest)) = ours_at else { warnings.push(drifted(format!("overrides entry `{key}` no longer exists"))); return; }; @@ -1133,6 +1262,13 @@ fn revert_overrides_line( ))); return; } + // A takeover recorded the user's pinned value: restore it in place + // (key + quoting preserved; the section obviously stays). + if let Some(orig) = rec.original.as_ref().and_then(Value::as_str) { + lines[idx] = format!(" {}: {orig}", yaml_key_like(key, &repr)); + *dirty = true; + return; + } lines.remove(idx); *dirty = true; if others == 0 { @@ -2112,6 +2248,182 @@ snapshots: assert!(live_lock.contains("overrides:\n other-pkg: 2.0.0\n\nimporters:")); } + // ── exact-version pin takeover ───────────────────────────────────────── + + /// package.json with a user-authored override pin (`key: value`) plus the + /// matching lock-side `overrides:` mirror line. 
+ fn pin_fixture_inputs(key: &str, value: &str) -> (String, String) { + let pkg = format!( + "{{\n \"name\": \"vendor-spike\",\n \"version\": \"1.0.0\",\n \"private\": true,\n \"dependencies\": {{\n \"consumer\": \"file:./consumer\",\n \"left-pad\": \"1.3.0\",\n \"left-pad-old\": \"npm:left-pad@1.2.0\"\n }},\n \"pnpm\": {{\n \"overrides\": {{\n \"{key}\": \"{value}\"\n }}\n }}\n}}\n" + ); + let lock = P1_BEFORE_LOCK.replace( + "importers:", + &format!("overrides:\n {key}: {value}\n\nimporters:"), + ); + (pkg, lock) + } + + /// A user-authored EXACT-version pin equal to the patched version is + /// taken over: the user's key keeps its spelling on both surfaces, its + /// value moves to our `file:` spec, the wiring records the pin as + /// `original`, and a full revert restores both files byte-identically. + #[tokio::test] + async fn user_exact_pin_bare_key_is_taken_over_and_revert_restores_it() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + // package.json: the USER'S key (`left-pad`) now carries our spec; + // no `left-pad@1.3.0` key was added; tables pre-existed. + let pkg: Value = serde_json::from_str(&fx.read(PACKAGE_JSON).await).unwrap(); + let overrides = &pkg["pnpm"]["overrides"]; + assert_eq!( + overrides["left-pad"], + Value::String(format!("file:{}", fx.rel_tgz())) + ); + assert!(overrides.get("left-pad@1.3.0").is_none()); + assert_eq!( + entry.pnpm, + Some(PnpmMeta { + created_overrides_table: false, + created_pnpm_table: false + }) + ); + + // Lock: same key, same value (map parity — pnpm hard-checks it). 
+ let live_lock = fx.read(PNPM_LOCK).await; + assert!( + live_lock.contains(&format!("overrides:\n left-pad: file:{}", fx.rel_tgz())), + "{live_lock}" + ); + + // Wiring: both override records carry the user's key, action + // Rewritten, and the pin as `original`. + for kind in [KIND_PKG_OVERRIDE, KIND_LOCK_OVERRIDES] { + let rec = entry + .wiring + .iter() + .find(|r| r.kind == kind) + .unwrap_or_else(|| panic!("no {kind} record: {:?}", entry.wiring)); + assert_eq!(rec.key.as_deref(), Some("left-pad"), "{kind}"); + assert_eq!(rec.action, WiringAction::Rewritten, "{kind}"); + assert_eq!( + rec.original, + Some(Value::String("1.3.0".to_string())), + "{kind}: the user's pin is the original" + ); + } + + // Full revert restores the pin on both surfaces byte-identically. + let outcome = revert_pnpm(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before); + assert_eq!(fx.read(PNPM_LOCK).await, lock_before); + } + + /// The versioned key shape (`left-pad@1.3.0: 1.3.0`) is taken over the + /// same way — the key happens to equal our canonical key. + #[tokio::test] + async fn user_exact_pin_versioned_key_is_taken_over() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad@1.3.0", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + let pkg: Value = serde_json::from_str(&fx.read(PACKAGE_JSON).await).unwrap(); + assert_eq!( + pkg["pnpm"]["overrides"]["left-pad@1.3.0"], + Value::String(format!("file:{}", fx.rel_tgz())) + ); + let rec = entry + .wiring + .iter() + .find(|r| r.kind == KIND_PKG_OVERRIDE) + .unwrap(); + assert_eq!(rec.original, Some(Value::String("1.3.0".to_string()))); + + // Revert restores the pin. 
+ let outcome = revert_pnpm(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before); + assert_eq!(fx.read(PNPM_LOCK).await, lock_before); + } + + /// A second vendor over a taken-over key is the in-sync hot path: + /// AlreadyPatched, no new ledger entry, bytes stable. (Guards the + /// `Ours` classification accepting the user-keyed vendor value — the + /// old `key == our_key` requirement would refuse its own wiring.) + #[tokio::test] + async fn takeover_rerun_is_in_sync_and_records_nothing() { + let (pkg_before, lock_before) = pin_fixture_inputs("left-pad", "1.3.0"); + let fx = fixture_with(&pkg_before, &lock_before).await; + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some()); + let pkg_after = fx.read(PACKAGE_JSON).await; + let lock_after = fx.read(PNPM_LOCK).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + assert!(entry.is_none(), "in-sync rerun records nothing"); + assert!(result + .files_verified + .iter() + .all(|v| v.status == crate::patch::apply::VerifyStatus::AlreadyPatched)); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_after, "bytes stable"); + assert_eq!(fx.read(PNPM_LOCK).await, lock_after, "bytes stable"); + } + + /// Selector chains and duplicate same-name keys still refuse — only a + /// plain exact pin is taken over. (Range keys and different-version + /// values are covered by `existing_user_override_for_the_name_is_refused`.) + #[tokio::test] + async fn chain_and_duplicate_override_keys_still_refuse() { + // `parent>child` chain, even with the exact version value. 
+ let (pkg, lock) = pin_fixture_inputs("consumer>left-pad", "1.3.0"); + let fx = fixture_with(&pkg, &lock).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("consumer>left-pad"), "{detail}"); + + // Two same-name keys (one ours-shaped pin + one bare pin). + let pkg = "{\n \"name\": \"x\",\n \"pnpm\": {\n \"overrides\": {\n \"left-pad\": \"1.3.0\",\n \"left-pad@1.3.0\": \"1.3.0\"\n }\n }\n}\n".to_string(); + let fx = fixture_with(&pkg, P1_BEFORE_LOCK).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("more than one"), "{detail}"); + } + + /// pkg↔lock override-key shape drift refuses (pnpm itself would fail + /// `ERR_PNPM_LOCKFILE_CONFIG_MISMATCH`); a pkg-side pin with NO lock + /// mirror is fine — the edit inserts the same key, restoring parity. + #[tokio::test] + async fn takeover_lock_shape_mismatch_refuses_but_missing_section_inserts() { + // Shape drift: pkg keys `left-pad`, lock keys `left-pad@1.3.0`. + let (pkg, _) = pin_fixture_inputs("left-pad", "1.3.0"); + let lock = P1_BEFORE_LOCK.replace( + "importers:", + "overrides:\n left-pad@1.3.0: 1.3.0\n\nimporters:", + ); + let fx = fixture_with(&pkg, &lock).await; + let detail = expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + assert!(detail.contains("must"), "{detail}"); + + // No lock overrides section at all: takeover inserts the pkg key. 
+ let fx = fixture_with(&pkg, P1_BEFORE_LOCK).await; + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let live_lock = fx.read(PNPM_LOCK).await; + assert!( + live_lock.contains(&format!("overrides:\n left-pad: file:{}", fx.rel_tgz())), + "lock key matches the pkg key: {live_lock}" + ); + assert!(entry.is_some()); + } + #[tokio::test] async fn created_tables_bookkeeping_and_revert_prunes_them() { // pnpm table exists (other keys), overrides created by us: revert diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs index b7fa943..f4c7ecc 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs @@ -195,6 +195,12 @@ pub async fn vendor_yarn_berry( format!("{PACKAGE_JSON} root is not an object"), ); }; + // A user-authored BARE-name pin to the exact version being vendored is + // TAKEN OVER (its value is rewritten to our spec — the pin already + // forced this exact version, so semantics are preserved — and recorded + // as the wiring `original` so revert restores it). Anything else + // same-name still refuses. + let mut takeover_original: Option = None; if let Some(res) = pkg_obj.get("resolutions") { let Some(res_obj) = res.as_object() else { return refused( @@ -210,19 +216,26 @@ pub async fn vendor_yarn_berry( continue; } // Our own (possibly stale-uuid) entry is fine to overwrite; a - // user-authored override is never clobbered. + // user-authored override is never clobbered silently. 
let ours = value .as_str() .is_some_and(|v| parse_vendor_path(v).is_some_and(|p| p.eco == "npm")); - if !ours { - return refused( - "vendor_override_conflict", - format!( - "{PACKAGE_JSON} already has a resolutions entry for `{selector}` \ - ({value}); vendor will not overwrite a user-authored override" - ), - ); + if ours { + continue; + } + if selector == name && value.as_str() == Some(version) { + takeover_original = Some(version.to_string()); + continue; } + return refused( + "vendor_override_conflict", + format!( + "{PACKAGE_JSON} already has a resolutions entry for `{selector}` \ + ({value}); vendor will not overwrite a user-authored override (an \ + exact-version pin `\"{name}\": \"{version}\"` is taken over \ + automatically)" + ), + ); } } @@ -394,16 +407,17 @@ pub async fn vendor_yarn_berry( WiringRecord { file: PACKAGE_JSON.to_string(), kind: KIND_RESOLUTION.to_string(), - // Rewritten only when replacing our own stale entry — and then - // there is deliberately no `original` (never record our own edit - // as a pre-vendor fragment). + // Rewritten when replacing our own stale entry (no `original` — + // never record our own edit as a pre-vendor fragment) or a + // taken-over user pin (whose value IS the `original`, restored + // verbatim on revert). action: if existing_entry { WiringAction::Rewritten } else { WiringAction::Added }, key: Some(name.to_string()), - original: None, + original: takeover_original.map(Value::String), new: Some(Value::String(spec)), }, WiringRecord { @@ -689,6 +703,13 @@ fn revert_resolution_record( )); return; } + // A takeover recorded the user's pinned value: restore it in place + // (the key and table stay). Otherwise remove our entry as before. 
+ if let Some(orig) = rec.original.as_ref().and_then(Value::as_str) { + res_obj.insert(key.to_string(), Value::String(orig.to_string())); + *changed = true; + return; + } res_obj.shift_remove(key); if res_obj.is_empty() { obj.shift_remove("resolutions"); @@ -1357,6 +1378,62 @@ __metadata: assert_eq!(tokio::fs::read(fx.pkg_path()).await.unwrap(), fx.pkg_bytes); } + /// A user-authored BARE-name pin to the exact version being vendored is + /// taken over: the value moves to our spec, the wiring records the pin + /// as `original`, and revert restores it (table kept). Range-keyed + /// selectors keep refusing. + #[tokio::test] + async fn user_exact_pin_resolution_is_taken_over_and_revert_restores_it() { + let pkg_before = B3_BEFORE_PKG.replace( + " }\n}", + " },\n \"resolutions\": {\n \"left-pad\": \"1.3.0\"\n }\n}", + ); + let fx = fixture_with(&pkg_before, B3_BEFORE_LOCK).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let entry = entry.unwrap(); + + let pkg: Value = + serde_json::from_slice(&tokio::fs::read(fx.pkg_path()).await.unwrap()).unwrap(); + let val = pkg["resolutions"]["left-pad"].as_str().unwrap(); + assert!( + parse_vendor_path(val).is_some_and(|p| p.eco == "npm"), + "pin value rewritten to our spec: {val}" + ); + + let rec = entry + .wiring + .iter() + .find(|r| r.kind == KIND_RESOLUTION) + .unwrap(); + assert_eq!(rec.action, WiringAction::Rewritten); + assert_eq!( + rec.original, + Some(Value::String("1.3.0".to_string())), + "the user's pin is the original" + ); + + // Revert restores the pin in place (the resolutions table stays). 
+ let outcome = revert_yarn_berry(&entry, fx.root(), false).await; + assert!(outcome.success, "{:?}", outcome.error); + let pkg: Value = + serde_json::from_slice(&tokio::fs::read(fx.pkg_path()).await.unwrap()).unwrap(); + assert_eq!( + pkg["resolutions"]["left-pad"], + Value::String("1.3.0".to_string()), + "pin restored" + ); + + // A range-keyed selector with the same value still refuses. + let pkg = B3_BEFORE_PKG.replace( + " }\n}", + " },\n \"resolutions\": {\n \"left-pad@npm:1.x\": \"1.3.0\"\n }\n}", + ); + let fx = fixture_with(&pkg, B3_BEFORE_LOCK).await; + expect_refused(fx.vendor(false).await, "vendor_override_conflict"); + } + #[tokio::test] async fn missing_entry_and_other_version_guards() { // No left-pad entry at all. From adc517993f80bb1d883cc8b44600c0cbcc2b7d95 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 15:56:54 -0400 Subject: [PATCH 04/19] feat(scan): prune lifecycle for vendored packages scan --prune previously blanket-exempted vendored purls, so nothing ever cleaned unused vendored state: dropped patches kept their artifacts and overrides forever, removed dependencies stayed redirected, and orphan uuid dirs were only swept by vendor --revert. The prune pass now runs a vendored-state GC first (under the apply lock; contention degrades to a skip, never a scan failure): (a) entries whose patch is gone from the manifest are reverted (same stale test as the vendor flows' reconcile_dropped); (b) entries whose dependency left the lockfile graph are reverted and their manifest entries dropped, feeding the same pass's blob sweep. Per-flavor in-use probes: pnpm scans packages:/snapshots: blocks for the artifact (the mirrored overrides: declaration alone is not usage); package-lock/yarn/bun probe the lock text for the uuid dir (those flavors wire resolutions into the lock itself). 
None = cannot determine = keep, fail-safe; detached entries are exempt (lockfile-invisible by design); (c) orphan .socket/vendor// dirs are swept (extracted from run_revert into a shared sweep_orphan_vendor_dirs). JSON gc gains revertedVendoredEntries/removedVendorOrphanDirs (wet) and revertableVendoredEntries/vendorOrphanDirs (preview, which also mirrors the wet pass's manifest drops so blob counts agree); human output gains a GC summary line. CLI_CONTRACT.md updated. Co-Authored-By: Claude Fable 5 --- crates/socket-patch-cli/CLI_CONTRACT.md | 5 +- crates/socket-patch-cli/src/commands/scan.rs | 128 ++++- .../socket-patch-cli/src/commands/vendor.rs | 456 +++++++++++++++++- .../src/patch/vendor/npm_flavor.rs | 129 +++++ .../src/patch/vendor/pnpm_lock.rs | 86 ++++ 5 files changed, 767 insertions(+), 37 deletions(-) diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md index 464ef6a..de9ef89 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -71,7 +71,7 @@ Beyond the globals above, each subcommand defines a small set of local arguments `scan --apply` opts JSON callers into the full discover → select → apply pipeline. Without it, `scan --json` stays read-only (discovery + `updates` array only). No effect outside `--json` mode — the non-JSON path always prompts the user interactively. -`scan --prune` opts into garbage collection. When set, `scan` removes manifest entries for packages no longer present in the crawl, then deletes orphan blob, diff, and package-archive files from `.socket/`. Off by default (v3.0) so a temporary uninstall doesn't silently destroy manifest state. +`scan --prune` opts into garbage collection. When set, `scan` removes manifest entries for packages no longer present in the crawl, then deletes orphan blob, diff, and package-archive files from `.socket/`. Off by default (v3.0) so a temporary uninstall doesn't silently destroy manifest state. 
The pass also reconciles vendored state (runs FIRST, under the apply lock — lock contention skips it without failing the scan): vendored entries whose patch is gone from the manifest are reverted, vendored entries whose dependency is no longer in the lockfile graph are reverted AND their manifest entries dropped (detached entries are exempt from both — they are manifest- and lockfile-invisible by design; a missing or undeterminable lockfile keeps the entry, fail-safe), and orphan `.socket/vendor//` dirs with no ledger entry are swept. The JSON `gc` sub-object gains `revertedVendoredEntries` + `removedVendorOrphanDirs` (wet) / `revertableVendoredEntries` + `vendorOrphanDirs` (preview). `scan` queries the patch API in `--batch-size` chunks. Authenticated runs POST `/v0/orgs/{slug}/patches/batch`; token-less runs POST `{proxy}/patch/batch` on the public proxy and degrade to per-package `GET /patch/by-package/:purl` requests in two cases: the deployed proxy predates the batch endpoint (legacy proxies answer the POST with their `400 "Unsupported endpoint"` catch-all), or the all-or-nothing batch validation rejects the chunk (e.g. a crawled PURL type the server doesn't recognize, such as `pkg:jsr/…` — the per-package path tolerates those individually, preserving the pre-batch scan semantics). Rate limits and over-capacity 503s surface instead of silently degrading. @@ -442,7 +442,8 @@ worse, lets a warm cache silently serve unpatched bytes): moved past the vendored uuid (that would break VEX verification with `vendor_uuid_mismatch` until a vendor run). The skip rides `apply.patches[]` as `skipped`/`vendored`; a newer available patch still surfaces in `updates[]` — the signal to run `scan --vendor`. `scan --prune` exempts - vendored purls (an absent installed copy is their NORMAL state, not grounds to prune). 
An + vendored purls from the crawl-based manifest prune (an absent installed copy is their NORMAL + state) but reconciles vendored state via the lockfile instead — see the `--prune` section. An explicit `get` is allowed to move the manifest past the vendored uuid and warns (`warnings[]` + stderr) that a `vendor` run must refresh the artifact. * **Old-binary skew caveat**: a pre-detached `socket-patch` binary running `vendor` against a diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index 182e10d..e9ebd7f 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -54,6 +54,12 @@ pub(crate) struct GcSummary { pub blobs: CleanupResult, pub diffs: CleanupResult, pub packages: CleanupResult, + /// Vendored entries reverted (or revertable, preview mode) because + /// their patch is gone from the manifest or their dependency left the + /// lockfile graph — see `vendor::run_vendor_gc`. Sorted. + pub vendored_reverted: Vec, + /// Orphan `.socket/vendor//` dirs swept (or sweepable). + pub vendor_orphan_dirs: usize, /// `true` when `--no-prune` was set; the sub-object only carries the /// `skipped: true` field in that case. pub skipped: bool, @@ -64,6 +70,17 @@ impl GcSummary { self.blobs.bytes_freed + self.diffs.bytes_freed + self.packages.bytes_freed } + /// Fold a vendored-state GC pass into this summary. + fn absorb_vendor_gc(&mut self, v: super::vendor::VendorGcSummary) { + self.vendored_reverted = v + .dropped_reverted + .into_iter() + .chain(v.unused_reverted) + .collect(); + self.vendored_reverted.sort(); + self.vendor_orphan_dirs = v.orphan_dirs; + } + /// Serialize for a *mutating* GC pass (post-apply). 
fn to_apply_json(&self) -> serde_json::Value { if self.skipped { @@ -74,6 +91,8 @@ impl GcSummary { "removedBlobs": self.blobs.blobs_removed, "removedDiffArchives": self.diffs.blobs_removed, "removedPackageArchives": self.packages.blobs_removed, + "revertedVendoredEntries": self.vendored_reverted, + "removedVendorOrphanDirs": self.vendor_orphan_dirs, "bytesFreed": self.total_bytes(), }) } @@ -88,6 +107,8 @@ impl GcSummary { "orphanBlobs": self.blobs.blobs_removed, "orphanDiffArchives": self.diffs.blobs_removed, "orphanPackageArchives": self.packages.blobs_removed, + "revertableVendoredEntries": self.vendored_reverted, + "vendorOrphanDirs": self.vendor_orphan_dirs, "bytesReclaimable": self.total_bytes(), }) } @@ -118,6 +139,7 @@ async fn run_gc( diffs, packages, skipped: false, + ..Default::default() } } @@ -127,16 +149,28 @@ async fn run_gc( /// `prune` flag — when GC isn't requested, simply don't call this function and /// don't emit a `gc` sub-object. async fn run_apply_gc( + common: &crate::args::GlobalArgs, manifest_path: &Path, socket_dir: &Path, scanned_purls: &HashSet, vendored: &HashSet, ) -> GcSummary { + // Vendored-state GC FIRST: it reverts manifest-dropped and + // lockfile-unused vendored entries, dropping the latter's manifest + // entries — so the manifest prune + blob sweep below reclaims their + // blobs in this same pass (and the stale `vendored` exemption set is + // harmless: the entries it would exempt are already gone). + let vendor_gc = super::vendor::run_vendor_gc(common, manifest_path, /*dry_run=*/ false).await; + // Re-read the just-written manifest (the apply step may have added // or updated entries we now want to consider for pruning). 
let mut manifest = match read_manifest(manifest_path).await { Ok(Some(m)) => m, - _ => return GcSummary::default(), + _ => { + let mut gc = GcSummary::default(); + gc.absorb_vendor_gc(vendor_gc); + return gc; + } }; let prunable = detect_prunable(&manifest, scanned_purls, vendored); for purl in &prunable { @@ -147,22 +181,42 @@ async fn run_apply_gc( // file-level cleanup below still operates on the in-memory copy. let _ = write_manifest(manifest_path, &manifest).await; } - run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ false).await + let mut gc = run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ false).await; + gc.absorb_vendor_gc(vendor_gc); + gc } /// Dry-run preview of the apply-mode GC pass. Same shape as /// [`run_apply_gc`] but emits `prunable*`/`orphan*` field names and /// performs no mutation. async fn preview_apply_gc( + common: &crate::args::GlobalArgs, manifest_path: &Path, socket_dir: &Path, scanned_purls: &HashSet, vendored: &HashSet, ) -> GcSummary { + // Read-only preview of the vendored-state GC (lists, never reverts). + let vendor_gc = super::vendor::run_vendor_gc(common, manifest_path, /*dry_run=*/ true).await; + let mut manifest = match read_manifest(manifest_path).await { Ok(Some(m)) => m, - _ => return GcSummary::default(), + _ => { + let mut gc = GcSummary::default(); + gc.absorb_vendor_gc(vendor_gc); + return gc; + } }; + // Mirror the wet pass: an unused vendored entry's manifest keys are + // dropped before the blob sweep, so drop them from the in-memory copy + // too — otherwise the preview under-reports orphan blobs/bytes + // relative to what the real `--prune` run frees. 
+ for purl in &vendor_gc.unused_reverted { + let base = strip_purl_qualifiers(purl).to_string(); + manifest + .patches + .retain(|k, _| k != purl && strip_purl_qualifiers(k) != base); + } let prunable = detect_prunable(&manifest, scanned_purls, vendored); // Mirror `run_apply_gc`: drop the prunable entries from the manifest // *before* computing orphans (no write — this is the preview). The @@ -174,7 +228,9 @@ async fn preview_apply_gc( for purl in &prunable { manifest.patches.remove(purl); } - run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ true).await + let mut gc = run_gc(&manifest, prunable, socket_dir, /*dry_run=*/ true).await; + gc.absorb_vendor_gc(vendor_gc); + gc } /// PURL strings present in the manifest but absent from `scanned_purls`. @@ -669,7 +725,7 @@ async fn run_vendor_json_path( result["vendor"] = preview_vendor_json(&args.common.cwd, &selected).await; if prune { let gc = - preview_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + preview_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; result["gc"] = gc.to_preview_json(); } let final_code = @@ -730,7 +786,7 @@ async fn run_vendor_json_path( // package_not_installed; vendored entries are exempt from // the prune itself. if prune { - let gc = run_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = run_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; result["gc"] = gc.to_apply_json(); } @@ -816,7 +872,7 @@ async fn run_vendor_interactive_path( // GC before the vendor step (see the JSON path): stale manifest // entries would fail vendoring with package_not_installed. 
if prune { - let gc = run_apply_gc(manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = run_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; if !gc.pruned.is_empty() { println!("GC: pruned {} manifest entr{}.", gc.pruned.len(), { if gc.pruned.len() == 1 { @@ -826,6 +882,15 @@ async fn run_vendor_interactive_path( } }); } + if !gc.vendored_reverted.is_empty() || gc.vendor_orphan_dirs > 0 { + println!( + "GC: reverted {} vendored entr{}; swept {} orphan vendor dir{}.", + gc.vendored_reverted.len(), + if gc.vendored_reverted.len() == 1 { "y" } else { "ies" }, + gc.vendor_orphan_dirs, + if gc.vendor_orphan_dirs == 1 { "" } else { "s" }, + ); + } } match boxed_scan_vendor_step( &args.common, @@ -1567,10 +1632,10 @@ pub async fn run(args: ScanArgs) -> i32 { // --- GC (if requested) -------------------------------------- if prune { let gc = if dry { - preview_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls) + preview_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls) .await } else { - run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await }; result["gc"] = if dry { gc.to_preview_json() @@ -1620,9 +1685,9 @@ pub async fn run(args: ScanArgs) -> i32 { // --- GC-only path (no --apply, just --prune) -------------------- if prune { let gc = if dry { - preview_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + preview_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await } else { - run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await }; result["gc"] = if dry { gc.to_preview_json() @@ -2033,7 +2098,7 @@ pub async fn run(args: ScanArgs) -> i32 { // run 
`socket-patch gc` (or `repair`) explicitly. (Vendor mode // already ran its GC before the vendor step.) if prune && !args.vendor { - let gc = run_apply_gc(&manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await; + let gc = run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await; let total = gc.blobs.blobs_removed + gc.diffs.blobs_removed + gc.packages.blobs_removed; if !args.common.silent && (!gc.pruned.is_empty() || total > 0) { println!( @@ -2045,6 +2110,15 @@ pub async fn run(args: ScanArgs) -> i32 { socket_patch_core::utils::cleanup_blobs::format_bytes(gc.total_bytes()), ); } + if !args.common.silent && (!gc.vendored_reverted.is_empty() || gc.vendor_orphan_dirs > 0) { + println!( + "GC: reverted {} vendored entr{}; swept {} orphan vendor dir{}.", + gc.vendored_reverted.len(), + if gc.vendored_reverted.len() == 1 { "y" } else { "ies" }, + gc.vendor_orphan_dirs, + if gc.vendor_orphan_dirs == 1 { "" } else { "s" }, + ); + } } embed_vex_human(&args.common, &args.vex, &manifest_path, code).await @@ -2235,6 +2309,16 @@ mod tests { HashSet::new() } + /// GlobalArgs rooted at the test project dir (the vendored-state GC + /// loads `.socket/vendor/state.json` from `cwd`; these fixtures have + /// none, so the vendor pass is a no-op). 
+ fn gc_common(cwd: &Path) -> crate::args::GlobalArgs { + crate::args::GlobalArgs { + cwd: cwd.to_path_buf(), + ..Default::default() + } + } + #[test] fn detect_prunable_empty_manifest_empty_scanned() { let m = PatchManifest::new(); @@ -2415,7 +2499,14 @@ mod tests { seed_manifest_with_blob(tmp.path(), "pkg:npm/gone@1.0.0", &after_hash); let scanned: HashSet = HashSet::new(); - let preview = preview_apply_gc(&manifest_path, &socket_dir, &scanned, &no_vendored()).await; + let preview = preview_apply_gc( + &gc_common(tmp.path()), + &manifest_path, + &socket_dir, + &scanned, + &no_vendored(), + ) + .await; assert_eq!( preview.pruned, @@ -2452,13 +2543,20 @@ mod tests { let (mp_p, sd_p, blob_p) = seed_manifest_with_blob(tmp_preview.path(), "pkg:npm/gone@1.0.0", &after_hash); let scanned: HashSet = HashSet::new(); - let preview = preview_apply_gc(&mp_p, &sd_p, &scanned, &no_vendored()).await; + let preview = preview_apply_gc( + &gc_common(tmp_preview.path()), + &mp_p, + &sd_p, + &scanned, + &no_vendored(), + ) + .await; assert!(blob_p.exists(), "preview must not mutate"); let tmp_wet = tempfile::tempdir().unwrap(); let (mp_w, sd_w, blob_w) = seed_manifest_with_blob(tmp_wet.path(), "pkg:npm/gone@1.0.0", &after_hash); - let wet = run_apply_gc(&mp_w, &sd_w, &scanned, &no_vendored()).await; + let wet = run_apply_gc(&gc_common(tmp_wet.path()), &mp_w, &sd_w, &scanned, &no_vendored()).await; assert_eq!( preview.blobs.blobs_removed, wet.blobs.blobs_removed, diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index 94fd6e7..d032439 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -239,6 +239,62 @@ pub(crate) async fn dispatch_revert_one( } } +/// Is this vendored entry still consumed by its project's lockfile +/// dependency graph? 
`None` = cannot determine — callers must keep the +/// entry (fail-safe): non-npm ecosystems have no in-use probe yet, and a +/// missing/unreadable lockfile proves nothing. +pub(crate) async fn dispatch_in_use_one(entry: &VendorEntry, project_root: &Path) -> Option { + match entry.ecosystem.as_str() { + "npm" => { + socket_patch_core::patch::vendor::npm_flavor::vendored_entry_in_use( + entry, + project_root, + ) + .await + } + _ => None, + } +} + +/// Uuid dirs under `.socket/vendor//` with no owning `(eco, uuid)` +/// ledger entry (a hand-edited state file, or artifacts left by an +/// interrupted run). The lockfile wiring for these is already gone or +/// owned by a recorded entry, so removal is safe; removed unless +/// `dry_run`. Unparseable dirs are never returned (and never deleted). +/// Returns the orphans so callers can emit events / counts. +pub(crate) async fn sweep_orphan_vendor_dirs( + cwd: &Path, + state: &socket_patch_core::patch::vendor::VendorState, + dry_run: bool, +) -> Vec { + let recorded_units: HashSet<(&str, &str)> = state + .entries + .values() + .map(|e| (e.ecosystem.as_str(), e.uuid.as_str())) + .collect(); + let mut orphans = Vec::new(); + for unit in vendor::path::sweep_vendor_dirs(cwd).await { + if recorded_units.contains(&(unit.eco.as_str(), unit.uuid.as_str())) { + continue; + } + if !dry_run { + let _ = remove_tree(&unit.dir).await; + } + orphans.push(unit); + } + orphans +} + +/// Does `eco` fall inside this run's `--ecosystems` scope? +pub(crate) fn ecosystem_in_scope(common: &GlobalArgs, eco: &str) -> bool { + match common.ecosystems.as_deref() { + None => true, + Some(list) => list.iter().any(|e| { + e.eq_ignore_ascii_case(eco) || (eco == "golang" && e.eq_ignore_ascii_case("go")) + }), + } +} + /// Surface a backend warning: stderr line for humans, a Skipped event with /// the stable code for JSON consumers (Skipped never flips the status). 
fn record_warning(env: &mut Envelope, purl: &str, warning: &VendorWarning, common: &GlobalArgs) { @@ -767,12 +823,6 @@ pub(crate) async fn reconcile_dropped( // Respect this run's --ecosystems scope: a `vendor --ecosystems npm` // invocation must not silently revert a cargo/go entry (restoring its // lockfile and deleting its artifact) as a cross-ecosystem side effect. - let in_scope = |eco: &str| match common.ecosystems.as_deref() { - None => true, - Some(list) => list.iter().any(|e| { - e.eq_ignore_ascii_case(eco) || (eco == "golang" && e.eq_ignore_ascii_case("go")) - }), - }; let stale: Vec = state .entries .iter() @@ -782,7 +832,7 @@ pub(crate) async fn reconcile_dropped( // normal state, not a drop — only `vendor --revert` or // `remove` may undo them. !entry.detached - && in_scope(&entry.ecosystem) + && ecosystem_in_scope(common, &entry.ecosystem) && !manifest.patches.contains_key(*purl) && !manifest.patches.contains_key(&entry.base_purl) }) @@ -875,19 +925,7 @@ async fn run_revert(args: &VendorArgs, env: &mut Envelope) -> i32 { // state file, or artifacts left by an interrupted run). The lockfile // wiring for these is already gone or owned by a recorded entry, so // removal is safe; unparseable dirs are reported, never deleted. 
- let swept = vendor::path::sweep_vendor_dirs(&common.cwd).await; - let recorded_units: HashSet<(&str, &str)> = state - .entries - .values() - .map(|e| (e.ecosystem.as_str(), e.uuid.as_str())) - .collect(); - for unit in swept { - if recorded_units.contains(&(unit.eco.as_str(), unit.uuid.as_str())) { - continue; - } - if !common.dry_run { - let _ = remove_tree(&unit.dir).await; - } + for unit in sweep_orphan_vendor_dirs(&common.cwd, &state, common.dry_run).await { let label = unit .purls .first() @@ -925,3 +963,381 @@ async fn run_revert(args: &VendorArgs, env: &mut Envelope) -> i32 { 0 } } + +// ───────────────────────── prune-time vendored GC ───────────────────────── + +/// Summary of the vendored-state GC pass `scan --prune` runs (wet or +/// preview). Purls are the state-ledger keys (manifest spelling). +#[derive(Debug, Default)] +pub(crate) struct VendorGcSummary { + /// (a) entries whose patch is gone from the manifest — reverted. + pub dropped_reverted: Vec, + /// (b) entries whose package left the lockfile dependency graph — + /// reverted, and their manifest entries dropped. + pub unused_reverted: Vec, + /// (c) orphan uuid dirs (no owning ledger entry) swept. + pub orphan_dirs: usize, + /// Entries that could not be reverted (kept in the ledger), plus any + /// pass-level skip marker (e.g. lock contention). + pub failed: Vec, +} + +/// The vendored-state GC behind `scan --prune`: +/// +/// (a) revert entries whose patch was dropped from the manifest (same +/// stale test as [`reconcile_dropped`], shared with the vendor flows); +/// (b) revert entries whose dependency is no longer in the lockfile graph +/// ([`dispatch_in_use_one`] == `Some(false)`; `None` keeps, fail-safe) +/// and drop their manifest entries so the caller's manifest prune + +/// blob sweep reclaims the rest in the same pass; +/// (c) sweep orphan uuid dirs. 
+/// +/// Detached entries are exempt from BOTH (a) (never manifest-tracked) and +/// (b) (lockfile-invisible by design — the probe would always call them +/// unused). A missing/unreadable manifest skips (a) only (a prune must +/// not mass-revert on a deleted manifest — that is `vendor --revert`'s +/// explicit contract). +/// +/// Wet runs take the apply lock (lockfiles + the manifest are rewritten); +/// contention records a skip marker and returns — it never fails the +/// scan. Dry runs are read-only, lock-free, and list-only. +pub(crate) async fn run_vendor_gc( + common: &GlobalArgs, + manifest_path: &Path, + dry_run: bool, +) -> VendorGcSummary { + let mut out = VendorGcSummary::default(); + let mut state = match load_state(&common.cwd).await { + Ok(s) if !s.entries.is_empty() => s, + // No ledger (or unreadable): only the orphan sweep could apply, and + // without a trustworthy ledger it must not delete anything. + _ => return out, + }; + + let socket_dir = manifest_path + .parent() + .map(Path::to_path_buf) + .unwrap_or_else(|| common.cwd.clone()); + let _guard = if dry_run { + None + } else { + match socket_patch_core::patch::apply_lock::acquire(&socket_dir, Duration::from_secs(0)) { + Ok(g) => Some(g), + Err(_) => { + out.failed.push( + "vendor GC skipped: another socket-patch run holds the apply lock".to_string(), + ); + return out; + } + } + }; + + // (a) manifest-dropped entries. 
+ let mut manifest = socket_patch_core::manifest::operations::read_manifest(manifest_path) + .await + .ok() + .flatten(); + if let Some(m) = &manifest { + let stale: Vec = state + .entries + .iter() + .filter(|(purl, entry)| { + !entry.detached + && ecosystem_in_scope(common, &entry.ecosystem) + && !m.patches.contains_key(*purl) + && !m.patches.contains_key(&entry.base_purl) + }) + .map(|(purl, _)| purl.clone()) + .collect(); + for purl in stale { + if dry_run { + out.dropped_reverted.push(purl); + continue; + } + let entry = state.entries.get(&purl).cloned().expect("listed above"); + if dispatch_revert_one(&entry, &common.cwd, false).await.success { + state.entries.remove(&purl); + out.dropped_reverted.push(purl); + } else { + out.failed.push(purl); + } + } + } + + // (b) lockfile-unused entries. + let mut manifest_dirty = false; + let candidates: Vec = state + .entries + .iter() + .filter(|(_, entry)| !entry.detached && ecosystem_in_scope(common, &entry.ecosystem)) + .map(|(purl, _)| purl.clone()) + .collect(); + for purl in candidates { + let entry = state.entries.get(&purl).cloned().expect("listed above"); + if dispatch_in_use_one(&entry, &common.cwd).await != Some(false) { + continue; // in use, or cannot determine — keep + } + if dry_run { + out.unused_reverted.push(purl); + continue; + } + if !dispatch_revert_one(&entry, &common.cwd, false).await.success { + out.failed.push(purl); + continue; + } + state.entries.remove(&purl); + if let Some(m) = manifest.as_mut() { + let base = strip_purl_qualifiers(&entry.base_purl).to_string(); + let dropped: Vec = m + .patches + .keys() + .filter(|k| *k == &purl || strip_purl_qualifiers(k) == base) + .cloned() + .collect(); + for k in dropped { + m.patches.remove(&k); + manifest_dirty = true; + } + } + out.unused_reverted.push(purl); + } + + if !dry_run { + let _ = save_state(&common.cwd, &state).await; + if manifest_dirty { + if let Some(m) = &manifest { + let _ = + 
socket_patch_core::manifest::operations::write_manifest(manifest_path, m).await; + } + } + } + + // (c) orphan uuid dirs, against the post-removal ledger. + out.orphan_dirs = sweep_orphan_vendor_dirs(&common.cwd, &state, dry_run) + .await + .len(); + out +} + +#[cfg(test)] +mod gc_tests { + use super::*; + use socket_patch_core::manifest::operations::{read_manifest, write_manifest}; + use socket_patch_core::patch::vendor::state::VendorArtifact; + use socket_patch_core::patch::vendor::VendorState; + use std::path::PathBuf; + + const UUID: &str = "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"; + const PURL: &str = "pkg:npm/left-pad@1.3.0"; + + fn entry(detached: bool) -> VendorEntry { + VendorEntry { + ecosystem: "npm".into(), + base_purl: PURL.into(), + uuid: UUID.into(), + artifact: VendorArtifact { + path: format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring: Vec::new(), + lock: None, + took_over_go_patches: false, + detached, + record: None, + flavor: Some("package-lock".into()), + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } + } + + /// Tempdir with: a manifest carrying PURL, a ledger with one entry, + /// the artifact on disk, and a package-lock that resolves to it. 
+ async fn gc_fixture(detached: bool) -> (tempfile::TempDir, GlobalArgs, PathBuf) { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let socket = root.join(".socket"); + tokio::fs::create_dir_all(socket.join(format!("vendor/npm/{UUID}"))) + .await + .unwrap(); + tokio::fs::write( + socket.join(format!("vendor/npm/{UUID}/left-pad-1.3.0.tgz")), + b"tgz", + ) + .await + .unwrap(); + + let mut manifest = PatchManifest::new(); + manifest.patches.insert( + PURL.to_string(), + socket_patch_core::manifest::schema::PatchRecord { + uuid: UUID.to_string(), + exported_at: String::new(), + files: HashMap::new(), + vulnerabilities: HashMap::new(), + description: String::new(), + license: String::new(), + tier: String::new(), + }, + ); + let manifest_path = socket.join("manifest.json"); + write_manifest(&manifest_path, &manifest).await.unwrap(); + + let mut state = VendorState::default(); + state.entries.insert(PURL.to_string(), entry(detached)); + save_state(root, &state).await.unwrap(); + + tokio::fs::write( + root.join("package-lock.json"), + format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await + .unwrap(); + + let common = GlobalArgs { + cwd: root.to_path_buf(), + json: true, + silent: true, + ..GlobalArgs::default() + }; + (tmp, common, manifest_path) + } + + /// In-manifest + in-lock: the GC keeps everything. + #[tokio::test] + async fn vendor_gc_keeps_in_use_entries() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.dropped_reverted.is_empty(), "{out:?}"); + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert_eq!(out.orphan_dirs, 0); + assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + } + + /// (a) the patch is gone from the manifest: revert + drop the entry. 
+ #[tokio::test] + async fn vendor_gc_reverts_manifest_dropped_entry() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + write_manifest(&manifest_path, &PatchManifest::new()) + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.dropped_reverted, vec![PURL.to_string()], "{out:?}"); + assert!(out.failed.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.is_empty()); + assert!( + !tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + "artifact dir removed by the revert" + ); + } + + /// (b) the dependency left the lockfile graph: revert + drop BOTH the + /// ledger entry and the manifest entry. + #[tokio::test] + async fn vendor_gc_reverts_unused_entry_and_drops_manifest_entry() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + // Re-lock without the dependency (no reference to the artifact). + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.unused_reverted, vec![PURL.to_string()], "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.is_empty()); + let manifest = read_manifest(&manifest_path).await.unwrap().unwrap(); + assert!( + !manifest.patches.contains_key(PURL), + "the unused entry's manifest record is dropped too" + ); + } + + /// Dry run lists without mutating anything. 
+ #[tokio::test] + async fn vendor_gc_dry_run_is_read_only() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + let state_before = tokio::fs::read(tmp.path().join(".socket/vendor/state.json")) + .await + .unwrap(); + let manifest_before = tokio::fs::read(&manifest_path).await.unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, true).await; + assert_eq!(out.unused_reverted, vec![PURL.to_string()], "{out:?}"); + assert_eq!( + tokio::fs::read(tmp.path().join(".socket/vendor/state.json")) + .await + .unwrap(), + state_before, + "dry run must not touch the ledger" + ); + assert_eq!( + tokio::fs::read(&manifest_path).await.unwrap(), + manifest_before, + "dry run must not touch the manifest" + ); + assert!( + tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + "dry run must not remove artifacts" + ); + } + + /// A missing/undeterminable lockfile keeps the entry (fail-safe), and a + /// DETACHED entry is exempt from both (a) and (b). + #[tokio::test] + async fn vendor_gc_keeps_undeterminable_and_detached_entries() { + // Lock removed entirely: probe says None → keep. + let (tmp, common, manifest_path) = gc_fixture(false).await; + tokio::fs::remove_file(tmp.path().join("package-lock.json")) + .await + .unwrap(); + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + + // Detached entry: absent from the manifest AND lockfile-invisible — + // exactly its normal state. Never reverted by the GC. 
+ let (tmp, common, manifest_path) = gc_fixture(true).await; + write_manifest(&manifest_path, &PatchManifest::new()) + .await + .unwrap(); + tokio::fs::write(tmp.path().join("package-lock.json"), "{\"packages\":{}}") + .await + .unwrap(); + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert!(out.dropped_reverted.is_empty(), "{out:?}"); + assert!(out.unused_reverted.is_empty(), "{out:?}"); + assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + } + + /// (c) uuid dirs with no owning ledger entry are swept (wet) / counted + /// (dry). + #[tokio::test] + async fn vendor_gc_sweeps_orphan_uuid_dirs() { + let (tmp, common, manifest_path) = gc_fixture(false).await; + let orphan_uuid = "1a2b3c4d-5e6f-4a1b-8c2d-9e0f1a2b3c4d"; + let orphan_dir = tmp.path().join(format!(".socket/vendor/npm/{orphan_uuid}")); + tokio::fs::create_dir_all(&orphan_dir).await.unwrap(); + tokio::fs::write(orphan_dir.join("left-pad-1.3.0.tgz"), b"tgz") + .await + .unwrap(); + + let out = run_vendor_gc(&common, &manifest_path, true).await; + assert_eq!(out.orphan_dirs, 1, "{out:?}"); + assert!(orphan_dir.exists(), "dry run keeps the orphan"); + + let out = run_vendor_gc(&common, &manifest_path, false).await; + assert_eq!(out.orphan_dirs, 1, "{out:?}"); + assert!(!orphan_dir.exists(), "wet run sweeps the orphan"); + // The recorded entry's dir survives the sweep. + assert!(tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists()); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs b/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs index eb04e9c..b0efc65 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_flavor.rs @@ -366,6 +366,54 @@ pub async fn vendor_npm_any( outcome } +/// Is this npm-vendored entry still consumed by its lockfile's dependency +/// graph? +/// +/// `Some(true)`: the lockfile still resolves something to the entry's +/// artifact. 
`Some(false)`: the lockfile is present and parses but no +/// resolution references `.socket/vendor/npm//` — the dependency +/// was removed and re-locked, so the vendoring is unused (an override/ +/// resolutions DECLARATION alone does not count: pnpm's mirrored +/// `overrides:` section is excluded by the flavor probe, and the other +/// flavors carry no declaration inside the lock at all). `None`: cannot +/// determine (missing lock, unknown flavor) — callers keep the entry, +/// fail-safe. Detached entries are lockfile-invisible BY DESIGN and must +/// never be routed here (the probe would always call them unused). +pub async fn vendored_entry_in_use(entry: &VendorEntry, project_root: &Path) -> Option { + match entry.flavor.as_deref() { + Some("pnpm") => super::pnpm_lock::pnpm_entry_in_use(entry, project_root).await, + // The remaining flavors wire resolutions into the lock itself + // (resolved URLs / file: ranges / package tuples), so a textual + // probe for the uuid dir is exact: the path appears iff some + // resolution still points at the artifact. shrinkwrap wins over + // package-lock, mirroring the vendor/revert lockfile selection. + None | Some("package-lock") => { + lock_text_mentions_uuid( + project_root, + &["npm-shrinkwrap.json", "package-lock.json"], + &entry.uuid, + ) + .await + } + Some("yarn-classic") | Some("yarn-berry") => { + lock_text_mentions_uuid(project_root, &["yarn.lock"], &entry.uuid).await + } + Some("bun") => lock_text_mentions_uuid(project_root, &["bun.lock"], &entry.uuid).await, + Some(_) => None, // unknown flavor: cannot determine + } +} + +/// First readable lockfile from `names`, probed for the uuid artifact dir. 
+async fn lock_text_mentions_uuid(project_root: &Path, names: &[&str], uuid: &str) -> Option { + let needle = format!(".socket/vendor/npm/{uuid}/"); + for name in names { + if let Ok(text) = tokio::fs::read_to_string(project_root.join(name)).await { + return Some(text.contains(&needle)); + } + } + None +} + /// Revert one recorded npm vendor entry through the flavor that wired it. /// Entries from before the flavor field existed (`None`) are package-lock /// wirings; an unknown flavor fails CLOSED (an older binary must not guess @@ -773,4 +821,85 @@ mod tests { assert!(outcome.success, "flavor {flavor:?}: {:?}", outcome.error); } } + + /// One minimal entry per flavor for the in-use probe. + fn probe_entry(flavor: Option<&str>) -> VendorEntry { + VendorEntry { + ecosystem: "npm".into(), + base_purl: "pkg:npm/left-pad@1.3.0".into(), + uuid: UUID.into(), + artifact: VendorArtifact { + path: format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring: Vec::new(), + lock: None, + took_over_go_patches: false, + detached: false, + record: None, + flavor: flavor.map(str::to_string), + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } + } + + /// The textual flavors: a resolution pointing at the uuid dir means in + /// use; a clean lock means unused; a missing lock or unknown flavor + /// cannot be determined (keep, fail-safe). + #[tokio::test] + async fn vendored_entry_in_use_textual_flavors() { + let entry = probe_entry(Some("package-lock")); + + // Missing lock: undeterminable. + let tmp = tempfile::tempdir().unwrap(); + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, None); + + // Lock resolves to our artifact: in use. 
+ touch( + tmp.path(), + "package-lock.json", + &format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + + // Dep removed + re-locked (no reference left): unused. + touch(tmp.path(), "package-lock.json", "{\"packages\":{}}").await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(false)); + + // shrinkwrap wins over package-lock (same precedence as vendoring). + touch( + tmp.path(), + "npm-shrinkwrap.json", + &format!( + "{{\"packages\":{{\"node_modules/left-pad\":{{\"resolved\":\"file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz\"}}}}}}" + ), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + + // yarn flavors probe yarn.lock. + let entry = probe_entry(Some("yarn-classic")); + let tmp = tempfile::tempdir().unwrap(); + touch( + tmp.path(), + "yarn.lock", + &format!("left-pad@1.3.0:\n resolved \"file:./.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz#abc\"\n"), + ) + .await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(true)); + touch(tmp.path(), "yarn.lock", "# yarn lockfile v1\n").await; + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, Some(false)); + + // Unknown flavor: undeterminable, fail-safe keep. + let entry = probe_entry(Some("future-pm")); + assert_eq!(vendored_entry_in_use(&entry, tmp.path()).await, None); + } } diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index 433f3b2..7b21bac 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -314,6 +314,46 @@ pub async fn vendor_pnpm( } } +/// Is this pnpm-vendored entry still consumed by the lock's dependency +/// graph? 
+/// +/// `Some(true)`: a `packages:`/`snapshots:` block resolves to the entry's +/// artifact (`@file:.socket/vendor/npm//...`) — some importer +/// still depends on the package. `Some(false)`: the lock parses cleanly +/// and carries NO such block — the dependency was removed and re-locked +/// (the `overrides:` declaration alone does NOT count as usage: pnpm +/// keeps it mirrored from package.json even when nothing matches it). +/// `None`: cannot determine (missing/unreadable/unsupported lock) — +/// callers must keep the entry, fail-safe. +pub async fn pnpm_entry_in_use(entry: &VendorEntry, project_root: &Path) -> Option { + let text = tokio::fs::read_to_string(project_root.join(PNPM_LOCK)) + .await + .ok()?; + if check_lock_version(&text).is_err() { + return None; + } + let lines = split_lines(&text); + for section in ["packages", "snapshots"] { + let Some((start, end)) = section_bounds(&lines, section) else { + continue; + }; + let mut i = start + 1; + while let Some(block) = next_block(&lines, i, end) { + let resolved_to_ours = block + .key + .find("@file:") + .map(|at| &block.key[at + 1..]) + .and_then(parse_vendor_path) + .is_some_and(|p| p.eco == "npm" && p.uuid == entry.uuid); + if resolved_to_ours { + return Some(true); + } + i = block.end; + } + } + Some(false) +} + /// Undo one pnpm-vendored package: restore the recorded pair fragments and /// remove the artifact dir. Reverse application order; per-record ownership /// is re-checked against the live fragment (drift ⇒ warning, left alone). 
@@ -2248,6 +2288,52 @@ snapshots: assert!(live_lock.contains("overrides:\n other-pkg: 2.0.0\n\nimporters:")); } + // ── in-use probe ─────────────────────────────────────────────────────── + + /// The prune-time in-use probe: a packages/snapshots block resolving to + /// the artifact means in use; an overrides declaration ALONE (the state + /// pnpm leaves after the dependency is removed and re-locked) does not; + /// a missing or unsupported-version lock is undeterminable (keep). + #[tokio::test] + async fn pnpm_entry_in_use_reflects_lock_graph() { + let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; + let (_, entry, _) = expect_done(fx.vendor(false).await); + let entry = entry.unwrap(); + + // Freshly vendored: the rekeyed file: blocks are in the graph. + assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, Some(true)); + + // Dep removed + re-locked: pnpm prunes the file: blocks but keeps + // the overrides declaration mirrored from package.json. + let removed_lock = format!( + "lockfileVersion: '9.0'\n\nsettings:\n autoInstallPeers: true\n\ + \noverrides:\n left-pad@1.3.0: file:{}\n\nimporters:\n\n .:\n \ + dependencies:\n consumer:\n specifier: file:./consumer\n \ + version: file:consumer\n\npackages:\n\n consumer@file:consumer:\n \ + resolution: {{directory: consumer, type: directory}}\n\nsnapshots:\n\n \ + consumer@file:consumer: {{}}\n", + fx.rel_tgz() + ); + tokio::fs::write(fx.root().join(PNPM_LOCK), &removed_lock) + .await + .unwrap(); + assert_eq!( + pnpm_entry_in_use(&entry, fx.root()).await, + Some(false), + "the lingering overrides declaration alone is not usage" + ); + + // Unsupported lock version: undeterminable. + tokio::fs::write(fx.root().join(PNPM_LOCK), "lockfileVersion: '6.0'\n") + .await + .unwrap(); + assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, None); + + // Missing lock: undeterminable. 
+ tokio::fs::remove_file(fx.root().join(PNPM_LOCK)).await.unwrap(); + assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, None); + } + // ── exact-version pin takeover ───────────────────────────────────────── /// package.json with a user-authored override pin (`key: value`) plus the From 7042cbcb38a23b50af17aa9cf282003c721e447b Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 16:15:06 -0400 Subject: [PATCH 05/19] test: e2e coverage for encoded scoped purls, mismatch annotation, prune lifecycle - scan_vendor_e2e: full pipeline with the API's percent-encoded scoped purl form (download -> vendor lookup against node_modules/@scope -> lock rewiring -> prune exemption); interactive pre-prompt baseline annotation + auto-force warning; scan --prune reverting an unused vendored entry (ledger + manifest + artifact + lock all reconciled) - clippy: too_many_arguments allow on stage_patch_pack, JsrPurlParts type alias Co-Authored-By: Claude Fable 5 --- .../socket-patch-cli/tests/scan_vendor_e2e.rs | 325 ++++++++++++++++++ .../src/patch/vendor/npm_common.rs | 1 + crates/socket-patch-core/src/utils/purl.rs | 6 +- 3 files changed, 331 insertions(+), 1 deletion(-) diff --git a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs index cc13f29..64f0b28 100644 --- a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs +++ b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs @@ -412,3 +412,328 @@ async fn scan_vendor_flag_conflicts_are_clap_errors() { ); } } + +// ───────────── percent-encoded scoped purls (API canonical form) ───────────── + +const SCOPED_CRAWLER_PURL: &str = "pkg:npm/@scope/left-pad@1.3.0"; +const SCOPED_API_PURL: &str = "pkg:npm/%40scope/left-pad@1.3.0"; + +/// Like `write_fixture`, but the installed package is the SCOPED +/// `@scope/left-pad` (the crawler reports the literal `@scope` form). 
+fn write_scoped_fixture(root: &Path) { + std::fs::write( + root.join("package.json"), + r#"{ "name": "scan-vendor-test", "version": "0.0.0" }"#, + ) + .unwrap(); + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "scan-vendor-test", + "version": "0.0.0", + "dependencies": { "@scope/left-pad": "^1.3.0" } + }, + "node_modules/@scope/left-pad": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@scope/left-pad/-/left-pad-1.3.0.tgz", + "integrity": "sha512-orig==", + "license": "WTFPL" + } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap(); + + let pkg = root.join("node_modules/@scope/left-pad"); + std::fs::create_dir_all(&pkg).unwrap(); + std::fs::write( + pkg.join("package.json"), + br#"{"name":"@scope/left-pad","version":"1.3.0"}"#, + ) + .unwrap(); + std::fs::write(pkg.join("index.js"), BEFORE).unwrap(); +} + +/// Mock API that serves the patch under the percent-ENCODED purl (the +/// canonical form the production patches API returns for scoped packages), +/// while the batch request/response is keyed by the crawler's literal form. +async fn mount_scoped_patch_api(mock: &MockServer, uuid: &str) { + let before_hash = git_sha256(BEFORE); + let after_hash = git_sha256(AFTER); + Mock::given(method("POST")) + .and(path(format!("/v0/orgs/{ORG_SLUG}/patches/batch"))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "packages": [{ + "purl": SCOPED_CRAWLER_PURL, + "patches": [{ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "tier": "free", + "cveIds": ["CVE-2026-0001"], + "ghsaIds": [], + "severity": "high", + "title": "vendor target" + }] + }], + "canAccessPaidPatches": false, + }))) + .mount(mock) + .await; + // Per-package search: the crawler purl, urlencoded. 
+ Mock::given(method("GET")) + .and(path(format!( + "/v0/orgs/{ORG_SLUG}/patches/by-package/pkg%3Anpm%2F%40scope%2Fleft-pad%401.3.0" + ))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "patches": [{ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "publishedAt": "2026-01-01T00:00:00Z", + "description": "Vendor patch", + "license": "MIT", + "tier": "free", + "vulnerabilities": {} + }], + "canAccessPaidPatches": false, + }))) + .mount(mock) + .await; + Mock::given(method("GET")) + .and(path(format!("/v0/orgs/{ORG_SLUG}/patches/view/{uuid}"))) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "uuid": uuid, + "purl": SCOPED_API_PURL, + "publishedAt": "2026-01-01T00:00:00Z", + "files": { + "package/index.js": { + "beforeHash": before_hash, + "afterHash": after_hash, + "blobContent": AFTER_B64, + } + }, + "vulnerabilities": {}, + "description": "Vendor patch", + "license": "MIT", + "tier": "free", + }))) + .mount(mock) + .await; +} + +/// The production patches API serves scoped purls percent-encoded +/// (`pkg:npm/%40scope/...`) and scan stores them verbatim as manifest keys. +/// The whole pipeline — download, vendor lookup against the literal +/// `node_modules/@scope/...` install, lock rewiring, prune exemption — must +/// bridge the two spellings. (Flowise regression: `%40modelcontextprotocol` +/// failed with `package not installed`.) +#[tokio::test] +async fn scan_vendor_resolves_percent_encoded_scoped_purl() { + let mock = MockServer::start().await; + mount_scoped_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_scoped_fixture(tmp.path()); + + // --prune in the same run: the freshly-downloaded ENCODED manifest + // entry must not be GC'd against the literal crawler purl. 
+ let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &["--prune"]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["status"], "success", "envelope={v}"); + + // Manifest keyed by the verbatim encoded purl — and NOT pruned. + let manifest: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/manifest.json")).unwrap(), + ) + .unwrap(); + assert_eq!( + manifest["patches"][SCOPED_API_PURL]["uuid"], UUID, + "manifest={manifest}" + ); + assert_eq!( + v["gc"]["prunedManifestEntries"], + serde_json::json!([]), + "the encoded entry must not look prunable: {v}" + ); + + // Vendored: artifact under the DECODED scope dir, lock rewired. + assert_eq!(v["vendor"]["summary"]["applied"], 1, "envelope={v}"); + let tgz = tmp.path().join(format!( + ".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )); + assert!(tgz.is_file(), "tarball at the decoded scoped path"); + let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); + assert!( + lock.contains(&format!(".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz")), + "lock consumes the vendored tarball; lock={lock}" + ); + // Ledger keyed by the verbatim encoded purl. + let state: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")).unwrap(), + ) + .unwrap(); + assert_eq!(state["entries"][SCOPED_API_PURL]["uuid"], UUID, "{state}"); +} + +// ───────────────────── prune reconciles vendored state ───────────────────── + +/// After a dependency is removed and re-locked, `scan --prune` (without +/// `--vendor`) reverts the now-unused vendored entry: lock restored, ledger +/// entry + manifest entry dropped, artifact dir removed. 
+#[tokio::test] +async fn scan_prune_reverts_unused_vendored_entry() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture(tmp.path()); + + // A second installed package so the later prune run's crawl is + // non-empty (left-pad itself gets removed below). + let other = tmp.path().join("node_modules/keeper"); + std::fs::create_dir_all(&other).unwrap(); + std::fs::write( + other.join("package.json"), + br#"{"name":"keeper","version":"1.0.0"}"#, + ) + .unwrap(); + + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + + // Simulate `npm uninstall left-pad` + re-lock: drop the dep from the + // lock graph and remove the installed copy. The override-free npm + // wiring leaves nothing else behind. + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { "name": "scan-vendor-test", "version": "0.0.0" } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(tmp.path().join("package-lock.json"), &lock_bytes).unwrap(); + std::fs::remove_dir_all(tmp.path().join("node_modules/left-pad")).unwrap(); + + // Plain prune scan (read-only discovery + GC; no --vendor, no --apply). 
+ let out = Command::new(binary()) + .args([ + "scan", + "--json", + "--prune", + "--yes", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let code = out.status.code().unwrap_or(-1); + assert_eq!(code, 0, "stdout={stdout}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + + assert_eq!( + v["gc"]["revertedVendoredEntries"], + serde_json::json!([PURL]), + "gc must report the reverted entry: {v}" + ); + + // Ledger empty (an emptied state file may be removed outright), + // manifest entry dropped, artifact gone. + match std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")) { + Ok(text) => { + let state: serde_json::Value = serde_json::from_str(&text).unwrap(); + assert!( + state["entries"].as_object().is_none_or(|m| m.is_empty()), + "ledger entry removed: {state}" + ); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => panic!("unexpected state.json read error: {e}"), + } + let manifest: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(tmp.path().join(".socket/manifest.json")).unwrap(), + ) + .unwrap(); + assert!( + manifest["patches"] + .as_object() + .is_none_or(|m| !m.contains_key(PURL)), + "manifest entry dropped: {manifest}" + ); + assert!( + !tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + "artifact dir removed" + ); + // The (already left-pad-free) lock stays exactly as the user re-locked + // it — the revert had nothing to restore there. 
+ assert_eq!( + std::fs::read(tmp.path().join("package-lock.json")).unwrap(), + lock_bytes + ); +} + +/// Interactive (non-JSON) `scan --vendor` pre-verifies patch baselines: +/// installed content matching NEITHER hash is annotated BEFORE the +/// confirm prompt, and the run still vendors (auto-force) with the +/// `vendor_content_mismatch_overwritten` warning on stderr. +#[tokio::test] +async fn scan_vendor_annotates_mismatched_baseline_and_vendors_anyway() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_fixture(tmp.path()); + // Divergent installed bytes: neither BEFORE nor AFTER. + std::fs::write( + tmp.path().join("node_modules/left-pad/index.js"), + b"divergent\n", + ) + .unwrap(); + + let out = Command::new(binary()) + .args([ + "scan", + "--vendor", + "--yes", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + ]) + .current_dir(tmp.path()) + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + assert_eq!( + out.status.code().unwrap_or(-1), + 0, + "stdout={stdout}; stderr={stderr}" + ); + assert!( + stdout.contains("installed content differs from patch baseline"), + "pre-prompt annotation present; stdout={stdout}" + ); + assert!( + stderr.contains("vendor_content_mismatch_overwritten"), + "overwrite warning surfaced; stderr={stderr}" + ); + // Vendored despite the mismatch. + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); +} diff --git a/crates/socket-patch-core/src/patch/vendor/npm_common.rs b/crates/socket-patch-core/src/patch/vendor/npm_common.rs index f54a893..e2b4cdb 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_common.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_common.rs @@ -121,6 +121,7 @@ pub(super) struct NpmStagedPack { /// verification — no pack, no dirs created). 
/// * `Ok((Some(staged), result))` — full success: the tarball is on disk at /// `staged.rel_tgz` and the caller proceeds to its lockfile wiring. +#[allow(clippy::too_many_arguments)] pub(super) async fn stage_patch_pack( purl: &str, installed_dir: &Path, diff --git a/crates/socket-patch-core/src/utils/purl.rs b/crates/socket-patch-core/src/utils/purl.rs index 393873d..4fa80ec 100644 --- a/crates/socket-patch-core/src/utils/purl.rs +++ b/crates/socket-patch-core/src/utils/purl.rs @@ -244,8 +244,12 @@ pub fn build_composer_purl(namespace: &str, name: &str, version: &str) -> String /// We follow the same shape as `parse_composer_purl` since both /// have a `/` namespace structure. The leading `@` on /// the scope is preserved (matching npm's `@scope/name` convention). +/// `((scope, name), version)` from a JSR purl, percent-decoded. #[cfg(feature = "deno")] -pub fn parse_jsr_purl(purl: &str) -> Option<((Cow<'_, str>, Cow<'_, str>), Cow<'_, str>)> { +pub type JsrPurlParts<'a> = ((Cow<'a, str>, Cow<'a, str>), Cow<'a, str>); + +#[cfg(feature = "deno")] +pub fn parse_jsr_purl(purl: &str) -> Option> { let base = strip_purl_qualifiers(purl); let rest = base.strip_prefix("pkg:jsr/")?; let at_idx = rest.rfind('@')?; From 1a2dc4a29ecfe2f7f208e01135c47bb32b8658f6 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 18:08:52 -0400 Subject: [PATCH 06/19] feat(vendor): lockfile inventory module for npm-family locks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Read-only inventories of the dependency set a lockfile resolves, independent of what is installed: name/version/purl plus the lock's artifact URL and content verifier (typed LockIntegrity: SRI, yarn sha1 fragment, berry cache-zip checksum, sha256 hex, go.sum h1 — the latter two for the ecosystems that follow). Powers scan's lockfile supplement and vendor's missing-package fetch. 
Covers all five npm flavors via detect_npm_lock_flavor (package-lock/ shrinkwrap, pnpm v9, yarn classic, yarn berry, bun). Fail-soft per entry, fail-closed per value (names/versions path-guarded; git/file/ link/workspace specs and our own vendored entries excluded; duplicate instances dedup preferring a verifier). lookup() bridges percent- encoded manifest purls. Reuses the wiring backends' parsers via pub(super) visibility bumps. Co-Authored-By: Claude Fable 5 --- .../src/patch/vendor/bun_lock.rs | 12 +- .../src/patch/vendor/lock_inventory.rs | 769 ++++++++++++++++++ .../socket-patch-core/src/patch/vendor/mod.rs | 1 + .../src/patch/vendor/pnpm_lock.rs | 14 +- .../src/patch/vendor/yarn_berry_lock.rs | 2 +- .../src/patch/vendor/yarn_classic_lock.rs | 2 +- 6 files changed, 785 insertions(+), 15 deletions(-) create mode 100644 crates/socket-patch-core/src/patch/vendor/lock_inventory.rs diff --git a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs index f1b3c51..f35a2c2 100644 --- a/crates/socket-patch-core/src/patch/vendor/bun_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/bun_lock.rs @@ -423,7 +423,7 @@ fn revert_one_record( // ───────────────────────── conservative line grammar ────────────────────── /// One parsed single-line packages entry. -struct BunEntry { +pub(super) struct BunEntry { line_idx: usize, /// Leading whitespace, re-emitted verbatim. indent: String, @@ -432,7 +432,7 @@ struct BunEntry { /// The key token exactly as spelled (incl. quotes), re-emitted verbatim. key_raw: String, /// Verbatim top-level tuple elements (trimmed). 
- elems: Vec, + pub(super) elems: Vec, trailing_comma: bool, } @@ -472,14 +472,14 @@ fn classify(entry: &BunEntry, target_spec: &str, name: &str) -> Option Option<(&str, &str)> { +pub(super) fn split_name_spec(s: &str) -> Option<(&str, &str)> { let at = s.rfind('@').filter(|&i| i > 0)?; Some((&s[..at], &s[at + 1..])) } /// `"lockfileVersion": ` head check — only the fixture-pinned text /// lockfile version is spliced (fail-closed on anything newer/older). -fn check_lock_version(text: &str) -> Result<(), String> { +pub(super) fn check_lock_version(text: &str) -> Result<(), String> { let version = text.lines().take(5).find_map(|line| { line.trim() .strip_prefix("\"lockfileVersion\":") @@ -514,7 +514,7 @@ fn packages_bounds(lines: &[String]) -> Option<(usize, usize)> { /// Strictly parse every entry line of the packages section. Any line that /// is neither blank nor a single-line `"key": [tuple]` entry fails CLOSED. -fn parse_packages_section(lines: &[String]) -> Result, String> { +pub(super) fn parse_packages_section(lines: &[String]) -> Result, String> { let Some((start, end)) = packages_bounds(lines) else { // No (or unterminated) packages section: an empty lock simply has // no entries; an unterminated one is malformed. @@ -651,7 +651,7 @@ fn split_top_level(interior: &str) -> Result, String> { } /// Decode a verbatim JSON string token; `None` if it is not one. -fn decode_json_string(token: &str) -> Option { +pub(super) fn decode_json_string(token: &str) -> Option { if !token.starts_with('"') { return None; } diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs new file mode 100644 index 0000000..c0c940a --- /dev/null +++ b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -0,0 +1,769 @@ +//! Read-only lockfile inventories: the dependency set a project's lockfile +//! resolves, independent of what is installed on disk. +//! +//! Two consumers: +//! +//! 
* `scan` supplements its installed-tree crawl with lockfile-only entries +//! (discovery on fresh clones and partial installs), warning that those +//! packages are not yet installed; +//! * `vendor` fetches the pristine artifact for a lockfile-resolved package +//! with no installed copy ([`super::registry_fetch`]), verifying the bytes +//! against the integrity the lock records — FAIL-CLOSED: an entry whose +//! lock carries no content verifier is never fetched. +//! +//! Parsing is fail-soft per entry (a malformed entry is skipped, never an +//! error; a malformed file yields `None`) and fail-closed per value: +//! names/versions are path-safety-guarded before an entry is emitted — the +//! lockfile is committed, tamperable input that later feeds filesystem paths +//! and download URLs. + +use std::collections::HashMap; +use std::path::Path; + +use serde_json::Value; + +use crate::patch::path_safety; +use crate::utils::purl::strip_purl_qualifiers; + +use super::npm_common::{is_safe_npm_name, parse_npm_purl}; +use super::npm_flavor::{detect_npm_lock_flavor, NpmLockFlavor}; +use super::path::parse_vendor_path; +use super::{bun_lock, pnpm_lock, yarn_berry_lock, yarn_classic_lock}; + +/// The content verifier a lockfile records for an entry. The fetch layer +/// refuses entries whose verifier is [`LockIntegrity::None`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum LockIntegrity { + /// SRI string (`sha512-`, possibly multi-hash space-separated) — + /// npm family; verified against the raw tarball bytes. + Sri(String), + /// yarn classic `resolved "...#"` fragment (40-hex) — verified + /// against the raw tarball bytes. + Sha1Hex(String), + /// yarn berry cache-zip checksum (`/`, e.g. `10c0/…`) — + /// verified by rebuilding the deterministic cache zip from the fetched + /// tarball and comparing (the lock never hashes the tarball itself). 
+ BerryChecksum(String), + /// Hex sha256 of the artifact (Cargo.lock `checksum`, pypi file hashes, + /// Gemfile.lock `CHECKSUMS`). + Sha256Hex(String), + /// go.sum module-zip dirhash (`h1:`). + GoH1(String), + /// The lock records no content verifier. + None, +} + +/// One lockfile-resolved package. +#[derive(Debug, Clone)] +pub struct LockfileEntry { + /// Vendor-ecosystem tag (`npm`, `cargo`, `golang`, `pypi`, `gem`, + /// `composer`) — matches `VendorEntry::ecosystem`. + pub ecosystem: &'static str, + /// Literal (percent-decoded) package name, e.g. `@scope/name`. + pub name: String, + /// Exact resolved version. + pub version: String, + /// Canonical literal purl (`pkg:npm/@scope/name@1.0.0`) — the same form + /// the crawlers emit. + pub purl: String, + /// Artifact URL when the lock records one (package-lock `resolved`, + /// yarn `resolved` minus its `#sha1` fragment, pnpm `tarball:`); `None` + /// means the fetcher constructs the conventional registry URL. + pub resolved: Option, + pub integrity: LockIntegrity, +} + +impl LockfileEntry { + fn npm( + name: impl Into, + version: impl Into, + resolved: Option, + integrity: LockIntegrity, + ) -> Self { + let (name, version) = (name.into(), version.into()); + let purl = format!("pkg:npm/{name}@{version}"); + LockfileEntry { + ecosystem: "npm", + name, + version, + purl, + resolved, + integrity, + } + } +} + +/// Inventory the project's npm-family lockfile. Routes by +/// [`detect_npm_lock_flavor`] (PnP markers, bun.lockb, unsupported lock +/// versions, and a missing lockfile all yield `None`). 
+pub async fn inventory_npm_lock( + project_root: &Path, +) -> Option<(NpmLockFlavor, Vec)> { + let (flavor, _warnings) = detect_npm_lock_flavor(project_root).await.ok()?; + let raw = match flavor { + NpmLockFlavor::PackageLock => inventory_package_lock(project_root).await, + NpmLockFlavor::Pnpm => inventory_pnpm_lock(project_root).await, + NpmLockFlavor::YarnClassic => inventory_yarn_classic(project_root).await, + NpmLockFlavor::YarnBerry => inventory_yarn_berry(project_root).await, + NpmLockFlavor::Bun => inventory_bun(project_root).await, + }?; + Some((flavor, finalize_npm(raw))) +} + +/// Match a manifest/API purl (possibly percent-encoded, possibly carrying +/// qualifiers) against the inventory. npm purls decode via +/// [`parse_npm_purl`] so `pkg:npm/%40scope/x@1` matches the literal entry. +pub fn lookup<'a>(entries: &'a [LockfileEntry], purl: &str) -> Option<&'a LockfileEntry> { + let base = strip_purl_qualifiers(purl); + if base.starts_with("pkg:npm/") { + let (name, version) = parse_npm_purl(base)?; + return entries + .iter() + .find(|e| e.ecosystem == "npm" && e.name == name && e.version == version); + } + // Other ecosystems route here as their fetchers land. + None +} + +/// Guard + dedup the raw npm entries: unsafe names/versions are dropped +/// fail-closed; duplicate (name, version) instances collapse to one, +/// preferring the instance that carries a verifier. 
+fn finalize_npm(raw: Vec) -> Vec { + let mut seen: HashMap<(String, String), usize> = HashMap::new(); + let mut out: Vec = Vec::new(); + for entry in raw { + if !is_safe_npm_name(&entry.name) + || !path_safety::is_safe_single_segment(&entry.version) + { + continue; + } + let key = (entry.name.clone(), entry.version.clone()); + match seen.get(&key) { + Some(&i) => { + if out[i].integrity == LockIntegrity::None + && entry.integrity != LockIntegrity::None + { + out[i] = entry; + } + } + None => { + seen.insert(key, out.len()); + out.push(entry); + } + } + } + out +} + +/// Keep a lock-recorded URL only when it is a plain http(s) artifact URL +/// (drops `git+…`, `file:…`, `link:…` — content the registry conventions +/// cannot reproduce; such entries stay listed for discovery but the fetch +/// layer's integrity rule decides fetchability). +fn http_url(raw: &str) -> Option { + (raw.starts_with("https://") || raw.starts_with("http://")).then(|| raw.to_string()) +} + +// ──────────────────── package-lock.json / npm-shrinkwrap ──────────────────── + +async fn inventory_package_lock(root: &Path) -> Option> { + // Shrinkwrap wins, mirroring `npm_lock::select_lockfile`. + let mut bytes = None; + for lock in ["npm-shrinkwrap.json", "package-lock.json"] { + if let Ok(b) = tokio::fs::read(root.join(lock)).await { + bytes = Some(b); + break; + } + } + let doc: Value = serde_json::from_slice(&bytes?).ok()?; + // v1 legacy locks have no `packages` map — no inventory (documented). + let packages = doc.get("packages")?.as_object()?; + + let mut out = Vec::new(); + for (key, node) in packages { + // "" is the root project; keys without node_modules/ are workspace + // members (mirrors npm_lock::scan_lock_matches' member rule). 
+ let Some((_, key_name)) = key.rsplit_once("node_modules/") else { + continue; + }; + if node.get("link").and_then(Value::as_bool).unwrap_or(false) + || node.get("inBundle").and_then(Value::as_bool).unwrap_or(false) + { + continue; + } + let name = node + .get("name") + .and_then(Value::as_str) + .unwrap_or(key_name) + .to_string(); + let Some(version) = node.get("version").and_then(Value::as_str) else { + continue; + }; + let resolved_raw = node.get("resolved").and_then(Value::as_str); + // Our own vendored spec: not a registry dependency. + if resolved_raw.is_some_and(|r| parse_vendor_path(r).is_some()) { + continue; + } + let integrity = node + .get("integrity") + .and_then(Value::as_str) + .map(|i| LockIntegrity::Sri(i.to_string())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm( + name, + version, + resolved_raw.and_then(http_url), + integrity, + )); + } + Some(out) +} + +// ─────────────────────────── pnpm-lock.yaml v9 ─────────────────────────── + +/// Extract one value from an inline YAML map fragment like +/// `{integrity: sha512-…, tarball: file:…}` (values optionally quoted). +fn inline_map_value(fragment: &str, field: &str) -> Option { + let at = fragment.find(&format!("{field}:"))?; + let rest = fragment[at + field.len() + 1..].trim_start(); + let end = rest.find([',', '}']).unwrap_or(rest.len()); + let value = rest[..end].trim().trim_matches(['\'', '"']); + (!value.is_empty()).then(|| value.to_string()) +} + +async fn inventory_pnpm_lock(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("pnpm-lock.yaml")) + .await + .ok()?; + let lines = pnpm_lock::split_lines(&text); + let (start, end) = pnpm_lock::section_bounds(&lines, "packages")?; + + let mut out = Vec::new(); + let mut i = start + 1; + while let Some(block) = pnpm_lock::next_block(&lines, i, end) { + i = block.end; + // Key grammar: `name@version` (name may be `@scope/name`), with + // optional peer-dep suffixes `(peer@1.2.3)…` after the version. 
+ let base = match block.key.find('(') { + Some(p) => block.key[..p].trim_end(), + None => block.key.as_str(), + }; + let Some(at) = base.rfind('@').filter(|&p| p > 0) else { + continue; + }; + let (name, version) = (&base[..at], &base[at + 1..]); + // Only plain registry versions: `file:`/`link:`/`https:`/git specs + // are not registry-resolvable. + if !version.chars().next().is_some_and(|c| c.is_ascii_digit()) { + continue; + } + let mut integrity = LockIntegrity::None; + let mut tarball: Option = None; + for line in &lines[block.header + 1..block.end] { + let t = line.trim(); + if let Some(rest) = t.strip_prefix("resolution:") { + if let Some(v) = inline_map_value(rest, "integrity") { + integrity = LockIntegrity::Sri(v); + } + tarball = inline_map_value(rest, "tarball"); + break; + } + } + // Our own vendored spec: not a registry dependency. + if tarball.as_deref().is_some_and(|t| parse_vendor_path(t).is_some()) { + continue; + } + out.push(LockfileEntry::npm( + name, + version, + tarball.as_deref().and_then(http_url), + integrity, + )); + } + Some(out) +} + +// ───────────────────────────── yarn.lock (classic) ───────────────────────────── + +async fn inventory_yarn_classic(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("yarn.lock")).await.ok()?; + let mut out = Vec::new(); + for block in yarn_classic_lock::scan_blocks(&text) { + // Our own vendored block: not a registry dependency. + if yarn_classic_lock::block_points_into_vendor(&block.lines) { + continue; + } + let patterns = yarn_classic_lock::split_key_patterns(&block.key); + let Some(name) = patterns + .first() + .and_then(|p| yarn_classic_lock::pattern_real_name(p)) + else { + continue; + }; + let Some(version) = yarn_classic_lock::classic_field(&block.lines, "version") else { + continue; + }; + let resolved_raw = yarn_classic_lock::classic_field(&block.lines, "resolved"); + // `resolved "url#sha1hex"` — the fragment is the legacy verifier. 
+ let (resolved, sha1_hex) = match resolved_raw { + Some(raw) => match raw.split_once('#') { + Some((url, frag)) => ( + http_url(url), + (frag.len() == 40 && frag.bytes().all(|b| b.is_ascii_hexdigit())) + .then(|| frag.to_ascii_lowercase()), + ), + None => (http_url(raw), None), + }, + None => (None, None), + }; + let integrity = yarn_classic_lock::classic_field(&block.lines, "integrity") + .map(|i| LockIntegrity::Sri(i.to_string())) + .or(sha1_hex.map(LockIntegrity::Sha1Hex)) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm(name, version, resolved, integrity)); + } + Some(out) +} + +// ───────────────────────────── yarn.lock (berry) ───────────────────────────── + +async fn inventory_yarn_berry(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("yarn.lock")).await.ok()?; + let mut out = Vec::new(); + // Berry reuses classic's block grammar (same scanner the berry backend + // imports); `__metadata` and workspace/patch/file resolutions are not + // registry packages. + for block in yarn_classic_lock::scan_blocks(&text) { + if block.key.starts_with("__metadata") { + continue; + } + let Some(resolution) = yarn_berry_lock::berry_field(&block.lines, "resolution") else { + continue; + }; + // Registry resolutions are `name@npm:` (a `::binding` + // suffix may follow). Anything else (workspace:/patch:/file:/link:) + // is skipped — including our own vendored file: resolutions. 
+ let Some((name, reference)) = yarn_classic_lock::split_pattern(&resolution) else { + continue; + }; + let Some(reference) = reference.strip_prefix("npm:") else { + continue; + }; + let version_from_res = reference.split("::").next().unwrap_or(reference); + let version = yarn_berry_lock::berry_field(&block.lines, "version") + .unwrap_or(version_from_res); + let integrity = yarn_berry_lock::berry_field(&block.lines, "checksum") + .map(|c| LockIntegrity::BerryChecksum(c.to_string())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry::npm(name, version, None, integrity)); + } + Some(out) +} + +// ──────────────────────────────── bun.lock ──────────────────────────────── + +async fn inventory_bun(root: &Path) -> Option> { + let text = tokio::fs::read_to_string(root.join("bun.lock")).await.ok()?; + bun_lock::check_lock_version(&text).ok()?; + let lines: Vec = text.split('\n').map(str::to_string).collect(); + let entries = bun_lock::parse_packages_section(&lines).ok()?; + + let mut out = Vec::new(); + for entry in entries { + // Registry entries are 4-tuples `[spec, registry, {deps}, sha512]`; + // our vendored 3-tuples and other shapes are skipped. + if entry.elems.len() != 4 || !entry.elems[2].starts_with('{') { + continue; + } + let Some(spec) = entry.elems.first().and_then(|e| bun_lock::decode_json_string(e)) + else { + continue; + }; + let Some((name, version)) = bun_lock::split_name_spec(&spec) else { + continue; + }; + if !version.chars().next().is_some_and(|c| c.is_ascii_digit()) { + continue; + } + let Some(registry) = bun_lock::decode_json_string(&entry.elems[1]) else { + continue; + }; + let Some(integrity) = bun_lock::decode_json_string(&entry.elems[3]) else { + continue; + }; + // elem[1] is `""` for the default registry; a full `.tgz` URL is + // used verbatim; any other base falls back to conventional URL + // construction (the integrity check still gates the content). 
+ let resolved = (registry.ends_with(".tgz")) + .then(|| http_url(®istry)) + .flatten(); + out.push(LockfileEntry::npm( + name, + version, + resolved, + LockIntegrity::Sri(integrity), + )); + } + Some(out) +} + +#[cfg(test)] +mod tests { + use super::*; + + async fn write(root: &Path, name: &str, content: &str) { + tokio::fs::write(root.join(name), content).await.unwrap(); + } + + fn entry<'a>(entries: &'a [LockfileEntry], name: &str) -> &'a LockfileEntry { + entries + .iter() + .find(|e| e.name == name) + .unwrap_or_else(|| panic!("no entry for {name}: {entries:?}")) + } + + // ── package-lock ────────────────────────────────────────────────────── + + const PACKAGE_LOCK: &str = r#"{ + "name": "fixture", + "version": "1.0.0", + "lockfileVersion": 3, + "packages": { + "": { "name": "fixture", "version": "1.0.0" }, + "packages/member": { "name": "member", "version": "0.0.1" }, + "node_modules/member": { "resolved": "packages/member", "link": true }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + "integrity": "sha512-XI5MPz==" + }, + "node_modules/@scope/pkg": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@scope/pkg/-/pkg-2.0.0.tgz", + "integrity": "sha512-scoped==" + }, + "node_modules/bundled-dep": { + "version": "1.0.0", + "inBundle": true + }, + "node_modules/git-dep": { + "version": "0.5.0", + "resolved": "git+ssh://git@github.com/x/git-dep.git#abc" + }, + "node_modules/vendored": { + "version": "3.0.0", + "resolved": "file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz", + "integrity": "sha512-ours==" + }, + "node_modules/evil": { + "version": "../../escape", + "resolved": "https://registry.npmjs.org/evil/-/evil-1.0.0.tgz", + "integrity": "sha512-evil==" + } + } +} +"#; + + #[tokio::test] + async fn package_lock_inventories_registry_entries() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "package-lock.json", 
PACKAGE_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::PackageLock); + + let lp = entry(&entries, "left-pad"); + assert_eq!(lp.version, "1.3.0"); + assert_eq!(lp.purl, "pkg:npm/left-pad@1.3.0"); + assert_eq!( + lp.resolved.as_deref(), + Some("https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz") + ); + assert_eq!(lp.integrity, LockIntegrity::Sri("sha512-XI5MPz==".into())); + + let scoped = entry(&entries, "@scope/pkg"); + assert_eq!(scoped.purl, "pkg:npm/@scope/pkg@2.0.0"); + + // git deps stay listed (discovery) but carry no fetchable URL. + let git = entry(&entries, "git-dep"); + assert_eq!(git.resolved, None); + assert_eq!(git.integrity, LockIntegrity::None); + + // Workspace members, links, bundled deps, our vendored spec, and + // the unsafe-version entry are all absent. + for absent in ["member", "fixture", "bundled-dep", "vendored", "evil"] { + assert!( + !entries.iter().any(|e| e.name == absent), + "{absent} must not be inventoried: {entries:?}" + ); + } + } + + #[tokio::test] + async fn shrinkwrap_wins_over_package_lock() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + write( + tmp.path(), + "npm-shrinkwrap.json", + r#"{ "lockfileVersion": 3, "packages": { + "node_modules/only-in-shrinkwrap": { "version": "9.9.9" } } }"#, + ) + .await; + + let (_, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert!(entries.iter().any(|e| e.name == "only-in-shrinkwrap")); + assert!(!entries.iter().any(|e| e.name == "left-pad")); + } + + #[tokio::test] + async fn legacy_v1_lock_without_packages_map_yields_none() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "package-lock.json", + r#"{ "lockfileVersion": 1, "dependencies": { "left-pad": { "version": "1.3.0" } } }"#, + ) + .await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + } + + // ── pnpm 
────────────────────────────────────────────────────────────── + + const PNPM_LOCK: &str = "lockfileVersion: '9.0' + +settings: + autoInstallPeers: true + +importers: + + .: + dependencies: + left-pad: + specifier: 1.3.0 + version: 1.3.0 + +packages: + + left-pad@1.3.0: + resolution: {integrity: sha512-XI5MPz==} + + '@scope/pkg@2.0.0': + resolution: {integrity: sha512-scoped==} + + peer-user@4.0.0(left-pad@1.3.0): + resolution: {integrity: sha512-peer==} + + local-thing@file:packages/local: + resolution: {directory: packages/local, type: directory} + + vendored@file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz: + resolution: {integrity: sha512-ours==, tarball: file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz} + +snapshots: + + left-pad@1.3.0: {} +"; + + #[tokio::test] + async fn pnpm_v9_keys_parse_with_peer_suffix_and_scoped_quoting() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "pnpm-lock.yaml", PNPM_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::Pnpm); + + assert_eq!( + entry(&entries, "left-pad").integrity, + LockIntegrity::Sri("sha512-XI5MPz==".into()) + ); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + assert_eq!(entry(&entries, "peer-user").version, "4.0.0"); + // registry entries carry no URL in v9 — constructed at fetch time. + assert_eq!(entry(&entries, "left-pad").resolved, None); + for absent in ["local-thing", "vendored"] { + assert!(!entries.iter().any(|e| e.name == absent), "{entries:?}"); + } + } + + // ── yarn classic ────────────────────────────────────────────────────── + + const YARN_CLASSIC: &str = "# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +\"@scope/pkg@^2.0.0\": + version \"2.0.0\" + resolved \"https://registry.yarnpkg.com/@scope/pkg/-/pkg-2.0.0.tgz#aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\" + integrity sha512-scoped== + +left-pad@1.3.0, left-pad@^1.3.0: + version \"1.3.0\" + resolved \"https://registry.yarnpkg.com/left-pad/-/left-pad-1.3.0.tgz#bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\" + integrity sha512-XI5MPz== + +old-school@0.1.0: + version \"0.1.0\" + resolved \"https://registry.yarnpkg.com/old-school/-/old-school-0.1.0.tgz#cccccccccccccccccccccccccccccccccccccccc\" + +aliased@npm:real-name@^3.0.0: + version \"3.0.0\" + resolved \"https://registry.yarnpkg.com/real-name/-/real-name-3.0.0.tgz#dddddddddddddddddddddddddddddddddddddddd\" + integrity sha512-alias== +"; + + #[tokio::test] + async fn yarn_classic_blocks_yield_resolved_sha1_and_integrity() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "yarn.lock", YARN_CLASSIC).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::YarnClassic); + + let lp = entry(&entries, "left-pad"); + assert_eq!( + lp.resolved.as_deref(), + Some("https://registry.yarnpkg.com/left-pad/-/left-pad-1.3.0.tgz"), + "the #sha1 fragment is split off the URL" + ); + assert_eq!(lp.integrity, LockIntegrity::Sri("sha512-XI5MPz==".into())); + + // Integrity-less old locks fall back to the sha1 fragment. + assert_eq!( + entry(&entries, "old-school").integrity, + LockIntegrity::Sha1Hex("c".repeat(40)) + ); + + // `alias@npm:real@range` resolves to the real name. + assert!(entries.iter().any(|e| e.name == "real-name")); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + } + + // ── yarn berry ──────────────────────────────────────────────────────── + + const YARN_BERRY: &str = "# This file is generated by running \"yarn install\" inside your project. +# Manifest files (package.json) are also used. 
+ +__metadata: + version: 8 + cacheKey: 10c0 + +\"fixture@workspace:.\": + version: 0.0.0-use.local + resolution: \"fixture@workspace:.\" + languageName: unknown + linkType: soft + +\"left-pad@npm:1.3.0\": + version: 1.3.0 + resolution: \"left-pad@npm:1.3.0\" + checksum: 10c0/deadbeefcafe== + languageName: node + linkType: hard + +\"@scope/pkg@npm:^2.0.0\": + version: 2.0.0 + resolution: \"@scope/pkg@npm:2.0.0\" + checksum: 10c0/scopedchecksum== + languageName: node + linkType: hard +"; + + #[tokio::test] + async fn yarn_berry_registry_resolutions_inventory_with_checksums() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "yarn.lock", YARN_BERRY).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::YarnBerry); + + let lp = entry(&entries, "left-pad"); + assert_eq!(lp.version, "1.3.0"); + assert_eq!( + lp.integrity, + LockIntegrity::BerryChecksum("10c0/deadbeefcafe==".into()) + ); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + // The workspace root is not a registry package. 
+ assert!(!entries.iter().any(|e| e.name == "fixture"), "{entries:?}"); + } + + // ── bun ─────────────────────────────────────────────────────────────── + + const BUN_LOCK: &str = r#"{ + "lockfileVersion": 1, + "workspaces": { + "": { "name": "fixture", "dependencies": { "left-pad": "1.3.0" } }, + }, + "packages": { + "left-pad": ["left-pad@1.3.0", "", {}, "sha512-XI5MPz=="], + "@scope/pkg": ["@scope/pkg@2.0.0", "", {}, "sha512-scoped=="], + "vendored": ["vendored@file:.socket/vendor/npm/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored-3.0.0.tgz", {}], + "linked": ["linked@workspace:packages/linked", {}], + } +} +"#; + + #[tokio::test] + async fn bun_registry_tuples_parse_and_locals_are_skipped() { + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "bun.lock", BUN_LOCK).await; + + let (flavor, entries) = inventory_npm_lock(tmp.path()).await.unwrap(); + assert_eq!(flavor, NpmLockFlavor::Bun); + + assert_eq!( + entry(&entries, "left-pad").integrity, + LockIntegrity::Sri("sha512-XI5MPz==".into()) + ); + assert_eq!(entry(&entries, "left-pad").resolved, None); + assert_eq!(entry(&entries, "@scope/pkg").version, "2.0.0"); + for absent in ["vendored", "linked"] { + assert!(!entries.iter().any(|e| e.name == absent), "{entries:?}"); + } + } + + // ── shared semantics ────────────────────────────────────────────────── + + #[tokio::test] + async fn lookup_bridges_percent_encoded_purls() { + let entries = vec![ + LockfileEntry::npm("@scope/pkg", "2.0.0", None, LockIntegrity::None), + LockfileEntry::npm("left-pad", "1.3.0", None, LockIntegrity::None), + ]; + assert!(lookup(&entries, "pkg:npm/%40scope/pkg@2.0.0").is_some()); + assert!(lookup(&entries, "pkg:npm/@scope/pkg@2.0.0").is_some()); + assert!(lookup(&entries, "pkg:npm/left-pad@1.3.0?artifact_id=x").is_some()); + assert!(lookup(&entries, "pkg:npm/left-pad@9.9.9").is_none()); + assert!(lookup(&entries, "pkg:pypi/left-pad@1.3.0").is_none()); + } + + #[tokio::test] + async fn 
dedup_prefers_integrity_bearing_instance() { + let raw = vec![ + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::Sri("sha512-x==".into())), + LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), + ]; + let out = finalize_npm(raw); + assert_eq!(out.len(), 1); + assert_eq!(out[0].integrity, LockIntegrity::Sri("sha512-x==".into())); + } + + #[tokio::test] + async fn unsupported_flavors_yield_none() { + // PnP marker wins over any lockfile. + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), ".pnp.cjs", "/* pnp */").await; + write(tmp.path(), "package-lock.json", PACKAGE_LOCK).await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + + // pnpm v6. + let tmp = tempfile::tempdir().unwrap(); + write(tmp.path(), "pnpm-lock.yaml", "lockfileVersion: '6.0'\n").await; + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + + // No lockfile at all. + let tmp = tempfile::tempdir().unwrap(); + assert!(inventory_npm_lock(tmp.path()).await.is_none()); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 1aa70cc..77e6639 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -53,6 +53,7 @@ pub mod cargo_lock; #[cfg(feature = "composer")] pub mod composer_lock; pub mod gem; +pub mod lock_inventory; #[cfg(feature = "golang")] pub mod golang; mod npm_common; diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index 7b21bac..0174f6c 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -1581,7 +1581,7 @@ async fn commit_pair( // pnpm-lock.yaml is machine-emitted with a fixed 2/4/6/8-space shape; these // helpers splice line blocks and never interpret YAML generically. 
-fn split_lines(text: &str) -> Vec { +pub(super) fn split_lines(text: &str) -> Vec { text.split('\n').map(str::to_string).collect() } @@ -1592,7 +1592,7 @@ fn join_lines(lines: &[String]) -> String { /// `(header_idx, end_idx)` of a top-level `name:` section; `end` is the /// first following column-0 line (exclusive), so trailing blank separator /// lines belong to the section. -fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { +pub(super) fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { let header = format!("{name}:"); let start = lines.iter().position(|l| l == &header)?; let end = lines @@ -1608,10 +1608,10 @@ fn section_bounds(lines: &[String], name: &str) -> Option<(usize, usize)> { /// One 2-space-keyed block inside a section (`[header, end)`; `end` stops at /// the blank separator / next block header, so the captured fragment is the /// verbatim entry without surrounding blanks). -struct YamlBlock { - header: usize, - end: usize, - key: String, +pub(super) struct YamlBlock { + pub(super) header: usize, + pub(super) end: usize, + pub(super) key: String, /// The key exactly as spelled in the file (incl. quotes) — rekeys /// preserve the file's quoting style. repr: String, @@ -1631,7 +1631,7 @@ impl YamlBlock { } /// The next block at or after line `i` (within `[i, end)`). 
-fn next_block(lines: &[String], mut i: usize, end: usize) -> Option { +pub(super) fn next_block(lines: &[String], mut i: usize, end: usize) -> Option { while i < end { if let Some((key, repr, rest)) = parse_key_line(&lines[i], 2) { let mut j = i + 1; diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs index f4c7ecc..2dd101b 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_berry_lock.rs @@ -898,7 +898,7 @@ fn carried_sections(lines: &[String]) -> Vec { } /// Read a berry scalar field (`: `, value possibly quoted). -fn berry_field<'a>(lines: &'a [String], field: &str) -> Option<&'a str> { +pub(super) fn berry_field<'a>(lines: &'a [String], field: &str) -> Option<&'a str> { for line in lines.iter().skip(1) { let Some(rest) = body_field_line(line) else { continue; diff --git a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs index fb25126..6278bb8 100644 --- a/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/yarn_classic_lock.rs @@ -555,7 +555,7 @@ fn rewrite_classic_block( /// Does this block's `resolved` already point into `.socket/vendor/npm/` /// (ours — current or stale uuid)? 
-fn block_points_into_vendor(lines: &[String]) -> bool { +pub(super) fn block_points_into_vendor(lines: &[String]) -> bool { classic_field(lines, "resolved") .and_then(parse_vendor_path) .is_some_and(|p| p.eco == "npm") From 9c93b906e7406ebb9f01caaeb7697d59ed48f719 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 18:14:29 -0400 Subject: [PATCH 07/19] =?UTF-8?q?feat(vendor):=20registry=5Ffetch=20?= =?UTF-8?q?=E2=80=94=20verified=20pristine-artifact=20fetching?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Downloads the artifact a lockfile entry resolves (lock-recorded URL, else the conventional npm registry URL; SOCKET_NPM_REGISTRY override), verifies it against the lock-recorded integrity FAIL-CLOSED before any disk write (strongest hash of a multi-hash SRI; yarn sha1 fragment; sha256 hex), and extracts to a private tempdir the vendor pipeline can stage from. Entries with no verifier are refused without any network I/O (Unverifiable). Hardening: http(s)-only, download/decompression/entry-count/entry-size caps, regular-files-only extraction with first-component strip + is_safe_relative_subpath (fail-closed on traversal-bearing tarballs, nothing half-extracts), exec bits preserved so the deterministic re-pack keeps bin scripts executable. 
Co-Authored-By: Claude Fable 5 --- .../src/patch/vendor/lock_inventory.rs | 2 +- .../socket-patch-core/src/patch/vendor/mod.rs | 1 + .../src/patch/vendor/registry_fetch.rs | 568 ++++++++++++++++++ 3 files changed, 570 insertions(+), 1 deletion(-) create mode 100644 crates/socket-patch-core/src/patch/vendor/registry_fetch.rs diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs index c0c940a..f663b51 100644 --- a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs +++ b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -342,7 +342,7 @@ async fn inventory_yarn_berry(root: &Path) -> Option> { // Registry resolutions are `name@npm:` (a `::binding` // suffix may follow). Anything else (workspace:/patch:/file:/link:) // is skipped — including our own vendored file: resolutions. - let Some((name, reference)) = yarn_classic_lock::split_pattern(&resolution) else { + let Some((name, reference)) = yarn_classic_lock::split_pattern(resolution) else { continue; }; let Some(reference) = reference.strip_prefix("npm:") else { diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 77e6639..05fddbf 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -54,6 +54,7 @@ pub mod cargo_lock; pub mod composer_lock; pub mod gem; pub mod lock_inventory; +pub mod registry_fetch; #[cfg(feature = "golang")] pub mod golang; mod npm_common; diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs new file mode 100644 index 0000000..80dc2d3 --- /dev/null +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -0,0 +1,568 @@ +//! Pristine-artifact fetching for lockfile-resolved packages with no +//! installed copy. +//! +//! 
`vendor` needs an installed package dir to stage from; on a fresh clone +//! there is none. This module downloads the pristine artifact the lockfile +//! resolves (the lock-recorded URL when present, the conventional registry +//! URL otherwise), verifies it against the integrity the lock records +//! **FAIL-CLOSED and before anything is written to the staging dir**, and +//! extracts it into a private tempdir the vendor pipeline then treats as +//! the installed dir. The project tree — node_modules included — is never +//! touched. +//! +//! Trust model: the URL comes from the user's own committed lockfile (or a +//! conventional construction from it); content trust comes from the +//! lock-recorded hash, not the transport — which is also why an entry with +//! no verifier ([`LockIntegrity::None`]) is refused outright +//! ([`FetchError::Unverifiable`]) without any network I/O. + +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use base64::Engine as _; +use sha1::Sha1; +use sha2::{Digest, Sha256, Sha384, Sha512}; + +use crate::constants::USER_AGENT; +use crate::patch::apply::is_safe_relative_subpath; + +use super::lock_inventory::{LockIntegrity, LockfileEntry}; + +/// The default npm registry; override with `SOCKET_NPM_REGISTRY` (the +/// enterprise-mirror / test escape hatch — `.npmrc` parsing is out of +/// scope, but lock-recorded `resolved` URLs already carry custom hosts). +pub const DEFAULT_NPM_REGISTRY: &str = "https://registry.npmjs.org"; + +/// Whole-package caps — wider than `patch/package.rs`'s patch-archive caps +/// because these are full upstream packages, but still bounded so a +/// poisoned lockfile cannot turn the fetch into a disk/memory bomb. +const MAX_DOWNLOAD_BYTES: u64 = 128 * 1024 * 1024; +const MAX_TOTAL_DECOMPRESSED_BYTES: u64 = 512 * 1024 * 1024; +const MAX_ENTRY_BYTES: u64 = 128 * 1024 * 1024; +const MAX_ENTRIES: usize = 60_000; + +/// A fetched, verified, extracted package. 
The tempdir lives exactly as +/// long as this value — callers must hold it until the vendor pipeline has +/// finished staging from [`FetchedPackage::dir`]. +#[derive(Debug)] +pub struct FetchedPackage { + dir: PathBuf, + /// Where the bytes came from (surfaced in the fetch warning event). + pub url: String, + _tmp: tempfile::TempDir, +} + +impl FetchedPackage { + /// The extracted package root (`package.json` at the top for npm). + pub fn dir(&self) -> &Path { + &self.dir + } +} + +#[derive(Debug)] +pub enum FetchError { + /// The entry cannot be verified against the lockfile (no integrity + /// recorded, or no fetcher for its ecosystem) — decided BEFORE any + /// network I/O; the caller keeps its `package_not_installed` outcome. + Unverifiable(String), + /// The fetch was attempted and failed (HTTP error, size cap, integrity + /// mismatch, extraction failure). User-facing message. + Failed(String), +} + +/// One shared client for all fetches in a run. +pub fn build_registry_client() -> reqwest::Client { + reqwest::Client::builder() + .user_agent(USER_AGENT) + .timeout(Duration::from_secs(60)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()) +} + +/// The npm registry base after the env override. +pub fn npm_registry_base() -> String { + std::env::var("SOCKET_NPM_REGISTRY") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_NPM_REGISTRY.to_string()) +} + +/// Conventional npm tarball URL: the scope stays in the package path, the +/// tarball leaf uses the bare name — +/// `{base}/@scope/name/-/name-1.0.0.tgz` / `{base}/name/-/name-1.0.0.tgz`. +pub fn npm_tarball_url(base: &str, name: &str, version: &str) -> String { + let leaf = name.rsplit('/').next().unwrap_or(name); + format!("{base}/{name}/-/{leaf}-{version}.tgz") +} + +/// Fetch + verify + extract one lockfile entry. Ecosystems without a +/// fetcher yet return [`FetchError::Unverifiable`] (callers keep their +/// not-installed outcome). 
+pub async fn fetch_and_stage( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + if entry.integrity == LockIntegrity::None { + return Err(FetchError::Unverifiable(format!( + "the lockfile records no integrity hash for {}@{}; refusing to fetch \ + unverifiable content", + entry.name, entry.version + ))); + } + match entry.ecosystem { + "npm" => fetch_npm(entry, client).await, + other => Err(FetchError::Unverifiable(format!( + "no registry fetcher for ecosystem `{other}`" + ))), + } +} + +async fn fetch_npm( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let url = entry.resolved.clone().unwrap_or_else(|| { + npm_tarball_url(&npm_registry_base(), &entry.name, &entry.version) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("package.json")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched tarball for {}@{} carries no package.json — not an npm package", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Capped download. http(s) only; the cap is enforced on the declared +/// Content-Length AND the actual stream (a lying server cannot blow past +/// it). 
+async fn download(client: &reqwest::Client, url: &str) -> Result<Vec<u8>, String> {
+    if !(url.starts_with("https://") || url.starts_with("http://")) {
+        return Err(format!("refusing non-http(s) artifact URL `{url}`"));
+    }
+    let mut resp = client
+        .get(url)
+        .send()
+        .await
+        .map_err(|e| format!("GET {url}: {e}"))?;
+    let status = resp.status();
+    if !status.is_success() {
+        return Err(format!("GET {url}: HTTP {status}"));
+    }
+    if let Some(len) = resp.content_length() {
+        if len > MAX_DOWNLOAD_BYTES {
+            return Err(format!(
+                "{url}: artifact is {len} bytes (cap {MAX_DOWNLOAD_BYTES})"
+            ));
+        }
+    }
+    let mut bytes: Vec<u8> = Vec::new();
+    while let Some(chunk) = resp
+        .chunk()
+        .await
+        .map_err(|e| format!("reading {url}: {e}"))?
+    {
+        if bytes.len() as u64 + chunk.len() as u64 > MAX_DOWNLOAD_BYTES {
+            return Err(format!(
+                "{url}: artifact exceeds the {MAX_DOWNLOAD_BYTES}-byte cap"
+            ));
+        }
+        bytes.extend_from_slice(&chunk);
+    }
+    Ok(bytes)
+}
+
+/// Verify downloaded bytes against the lock-recorded verifier. Runs BEFORE
+/// any disk write. Berry cache-zip checksums and go.sum dirhashes have
+/// dedicated verifiers in their ecosystems' fetchers.
+fn verify_integrity(bytes: &[u8], integrity: &LockIntegrity) -> Result<(), FetchError> {
+    match integrity {
+        LockIntegrity::Sri(sri) => verify_sri(bytes, sri).map_err(FetchError::Failed),
+        LockIntegrity::Sha1Hex(expect) => {
+            let actual = hex::encode(Sha1::digest(bytes));
+            if actual.eq_ignore_ascii_case(expect) {
+                Ok(())
+            } else {
+                Err(FetchError::Failed(format!(
+                    "sha1 mismatch: lockfile records {expect}, downloaded bytes hash to {actual}"
+                )))
+            }
+        }
+        LockIntegrity::Sha256Hex(expect) => {
+            let actual = hex::encode(Sha256::digest(bytes));
+            if actual.eq_ignore_ascii_case(expect) {
+                Ok(())
+            } else {
+                Err(FetchError::Failed(format!(
+                    "sha256 mismatch: lockfile records {expect}, downloaded bytes hash to {actual}"
+                )))
+            }
+        }
+        LockIntegrity::BerryChecksum(_) | LockIntegrity::GoH1(_) => {
+            Err(FetchError::Unverifiable(
+                "verifier handled by a dedicated ecosystem fetcher".to_string(),
+            ))
+        }
+        LockIntegrity::None => Err(FetchError::Unverifiable(
+            "no integrity recorded".to_string(),
+        )),
+    }
+}
+
+/// SRI verification: pick the strongest hash of a (possibly multi-hash,
+/// whitespace-separated) SRI string and compare base64 digests.
+fn verify_sri(bytes: &[u8], sri: &str) -> Result<(), String> { + let mut best: Option<(u8, &str, &str)> = None; + for token in sri.split_whitespace() { + let Some((algo, b64)) = token.split_once('-') else { + continue; + }; + let rank = match algo { + "sha512" => 3, + "sha384" => 2, + "sha256" => 1, + _ => continue, + }; + if best.map(|(r, _, _)| rank > r).unwrap_or(true) { + best = Some((rank, algo, b64)); + } + } + let Some((_, algo, expect)) = best else { + return Err(format!("no usable hash in SRI `{sri}`")); + }; + let b64 = base64::engine::general_purpose::STANDARD; + let actual = match algo { + "sha512" => b64.encode(Sha512::digest(bytes)), + "sha384" => b64.encode(Sha384::digest(bytes)), + _ => b64.encode(Sha256::digest(bytes)), + }; + if actual == expect { + Ok(()) + } else { + Err(format!( + "{algo} integrity mismatch: lockfile records {expect}, downloaded bytes hash to \ + {actual}" + )) + } +} + +/// Strip the FIRST path component (npm's tarball semantics — usually +/// `package/`, but registry tarballs may use any prefix dir). +fn strip_first_component(path: &Path) -> Option { + let mut components = path.components(); + components.next()?; + let rest = components.as_path(); + (!rest.as_os_str().is_empty()).then(|| rest.to_path_buf()) +} + +/// Traversal-guarded, mode-preserving tgz extraction (the same guard +/// family as `patch/package.rs::read_archive_to_map`, plus exec-bit +/// preservation: the deterministic re-pack reads modes from disk, so a +/// bytes-only extraction would silently strip bin scripts' exec bits). +/// Fails CLOSED on any traversal-shaped entry — a malicious tarball must +/// not half-extract. +fn extract_tgz(bytes: &[u8], dest: &Path) -> Result<(), String> { + use std::io::Read as _; + let gz = flate2::read::GzDecoder::new(bytes).take(MAX_TOTAL_DECOMPRESSED_BYTES); + let mut archive = tar::Archive::new(gz); + let mut count = 0usize; + for entry in archive + .entries() + .map_err(|e| format!("unreadable tarball: {e}"))? 
+ { + let mut entry = entry.map_err(|e| format!("unreadable tarball entry: {e}"))?; + count += 1; + if count > MAX_ENTRIES { + return Err(format!("tarball exceeds {MAX_ENTRIES} entries")); + } + // Regular files only: symlinks/hardlinks/devices never extract + // (a symlink could redirect later entries out of the stage). + if !entry.header().entry_type().is_file() { + continue; + } + let raw = entry + .path() + .map_err(|e| format!("tarball entry has an undecodable path: {e}"))? + .into_owned(); + let Some(rel) = strip_first_component(&raw) else { + continue; // a bare prefix-level file — not package content + }; + let rel_str = rel.to_string_lossy(); + if !is_safe_relative_subpath(&rel_str) { + return Err(format!( + "tarball entry `{}` escapes the extraction dir — refusing the artifact", + raw.display() + )); + } + let size = entry.header().size().unwrap_or(u64::MAX); + if size > MAX_ENTRY_BYTES { + return Err(format!( + "tarball entry `{rel_str}` is {size} bytes (cap {MAX_ENTRY_BYTES})" + )); + } + let target = dest.join(&rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut entry, &mut out) + .map_err(|e| format!("cannot extract `{rel_str}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mode = entry.header().mode().unwrap_or(0o644); + let perms = if mode & 0o111 != 0 { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use wiremock::matchers::{method, path as url_path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + /// Build a gzipped tarball with the given `(path, bytes, exec)` entries. 
+ fn make_tgz(entries: &[(&str, &[u8], bool)]) -> Vec { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes, exec) in entries { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(if *exec { 0o755 } else { 0o644 }); + header.set_cksum(); + builder.append_data(&mut header, path, *bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() + } + + fn sri_of(bytes: &[u8]) -> String { + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) + } + + fn npm_entry(resolved: Option, integrity: LockIntegrity) -> LockfileEntry { + LockfileEntry { + ecosystem: "npm", + name: "left-pad".into(), + version: "1.3.0".into(), + purl: "pkg:npm/left-pad@1.3.0".into(), + resolved, + integrity, + } + } + + #[test] + fn tarball_url_forms() { + assert_eq!( + npm_tarball_url(DEFAULT_NPM_REGISTRY, "left-pad", "1.3.0"), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz" + ); + assert_eq!( + npm_tarball_url(DEFAULT_NPM_REGISTRY, "@scope/pkg", "2.0.0"), + "https://registry.npmjs.org/@scope/pkg/-/pkg-2.0.0.tgz", + "the scope stays in the path; the leaf uses the bare name" + ); + } + + #[test] + fn sri_picks_strongest_hash_and_compares() { + let bytes = b"hello"; + let good = sri_of(bytes); + assert!(verify_sri(bytes, &good).is_ok()); + // Multi-hash: a wrong sha256 alongside the right sha512 still passes + // (strongest wins), and vice versa fails. 
+ let multi = format!("sha256-WRONG= {good}"); + assert!(verify_sri(bytes, &multi).is_ok()); + let bad = sri_of(b"other"); + assert!(verify_sri(bytes, &bad).is_err()); + assert!(verify_sri(bytes, "md5-abc=").is_err(), "unknown algos refuse"); + } + + #[tokio::test] + async fn fetch_verifies_sri_and_extracts_with_modes() { + let tgz = make_tgz(&[ + ("package/package.json", br#"{"name":"left-pad"}"#, false), + ("package/bin/cli.js", b"#!/usr/bin/env node\n", true), + ("package/index.js", b"module.exports = 1;\n", false), + ]); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz.clone())) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(&tgz)), + ); + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("package.json").is_file()); + assert_eq!( + std::fs::read(fetched.dir().join("index.js")).unwrap(), + b"module.exports = 1;\n" + ); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mode = std::fs::metadata(fetched.dir().join("bin/cli.js")) + .unwrap() + .permissions() + .mode(); + assert_eq!(mode & 0o111, 0o111, "exec bit preserved"); + } + // The tempdir dies with the holder. 
+ let dir = fetched.dir().to_path_buf(); + drop(fetched); + assert!(!dir.exists()); + } + + #[tokio::test] + async fn integrity_mismatch_fails_before_extraction() { + let tgz = make_tgz(&[("package/package.json", b"{}", false)]); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(b"the lock expects different bytes")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => { + assert!(msg.contains("mismatch"), "{msg}") + } + other => panic!("expected integrity failure, got {other:?}"), + } + } + + #[tokio::test] + async fn unverifiable_entry_refuses_without_network() { + // A URL that would hard-fail if contacted — Unverifiable proves the + // decision happened before any I/O. + let entry = npm_entry( + Some("http://127.0.0.1:1/nope.tgz".into()), + LockIntegrity::None, + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => { + assert!(msg.contains("no integrity"), "{msg}") + } + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[tokio::test] + async fn http_error_and_scheme_guard_fail_closed() { + let mock = MockServer::start().await; + // No mounted route → 404. 
+ let entry = npm_entry( + Some(format!("{}/missing.tgz", mock.uri())), + LockIntegrity::Sri(sri_of(b"x")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("404"), "{msg}"), + other => panic!("expected HTTP failure, got {other:?}"), + } + + let entry = npm_entry( + Some("ftp://example.com/x.tgz".into()), + LockIntegrity::Sri(sri_of(b"x")), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("non-http"), "{msg}"), + other => panic!("expected scheme refusal, got {other:?}"), + } + } + + #[test] + fn extraction_strips_first_component_whatever_its_name() { + let tgz = make_tgz(&[("weird-prefix/package.json", b"{}", false)]); + let tmp = tempfile::tempdir().unwrap(); + extract_tgz(&tgz, tmp.path()).unwrap(); + assert!(tmp.path().join("package.json").is_file()); + } + + #[test] + fn traversal_entries_fail_closed() { + // The tar crate refuses to WRITE `..` paths, so craft the header + // name bytes directly — exactly what a hostile tarball would carry. 
+ for evil in ["package/../../escape.js", "package/x/../../../up.js"] { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + { + let name = &mut header.as_gnu_mut().unwrap().name; + name[..evil.len()].copy_from_slice(evil.as_bytes()); + } + header.set_size(4); + header.set_mode(0o644); + header.set_cksum(); + builder.append(&header, &b"evil"[..]).unwrap(); + let tgz = builder.into_inner().unwrap().finish().unwrap(); + + let tmp = tempfile::tempdir().unwrap(); + let err = extract_tgz(&tgz, tmp.path()).unwrap_err(); + assert!(err.contains("escapes"), "{evil}: {err}"); + assert!( + std::fs::read_dir(tmp.path()).unwrap().next().is_none(), + "nothing may extract from a traversal-bearing tarball" + ); + } + } + + #[test] + fn oversized_entry_header_fails_closed() { + // A header CLAIMING more than the per-entry cap fails before any + // attempt to read that much data. + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + header.set_path("package/huge.bin").unwrap(); + header.set_size(MAX_ENTRY_BYTES + 1); + header.set_mode(0o644); + header.set_cksum(); + // Intentionally append no data: the size check fires first. 
+ let inner = { + use std::io::Write as _; + builder.get_mut().write_all(&header.as_bytes()[..]).unwrap(); + builder.into_inner().unwrap().finish().unwrap() + }; + let tmp = tempfile::tempdir().unwrap(); + let err = extract_tgz(&inner, tmp.path()).unwrap_err(); + assert!( + err.contains("cap") || err.contains("unreadable"), + "oversize header fails closed: {err}" + ); + } +} From 18822dc9888998c76a99a94c9d8f4f6911a9fa4f Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 18:28:40 -0400 Subject: [PATCH 08/19] feat(vendor,scan): auto-fetch missing packages + lockfile/ledger discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vendor: a manifest patch whose package has no installed copy is now satisfied automatically (no flag) instead of failing with package_not_installed: - already-vendored purls stage from their own committed artifact, sha256-verified against the vendor ledger (fresh-clone re-vendor and in-sync runs work offline, no registry traffic); - otherwise the lockfile-resolved pristine artifact is fetched (lock-recorded URL else conventional registry URL), verified against the lock's integrity FAIL-CLOSED, and staged from a private tempdir — the project tree is never touched. Reason codes: vendor_fetched_missing (skip-warning beside the Applied event), vendor_fetch_failed (distinct Failed, suppresses the duplicate not-installed skip), vendor_fetch_unverifiable (no lock integrity → calm skip). --offline keeps the calm skip and names the lockfile as the would-be source. 
scan: discovery now supplements the installed-tree crawl with (a) lockfile-only dependencies — warned '[NOT INSTALLED]' in the table + a stderr note, JSON lockfileOnlyPackages + packages[].notInstalled, counted as scanned so a wiped node_modules no longer prunes lockfile-listed entries, partitioned out of --apply BEFORE download (calm skipped/package_not_installed records, exit 0, no manifest writes) while --vendor passes them to the auto-fetch; and (b) vendored-ledger entries — the committed artifact IS the dependency, so updates[] detection and scan --vendor keep working on a fresh clone before any install. scan --json --vendor now vendors a completely fresh clone end-to-end (e2e-proven, second run already_vendored). Co-Authored-By: Claude Fable 5 --- Cargo.lock | 2 + crates/socket-patch-cli/Cargo.toml | 3 + crates/socket-patch-cli/src/commands/scan.rs | 245 ++++++++++- .../socket-patch-cli/src/commands/vendor.rs | 164 ++++++- .../socket-patch-cli/tests/scan_vendor_e2e.rs | 405 ++++++++++++++++++ .../src/patch/vendor/registry_fetch.rs | 42 ++ 6 files changed, 854 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab20718..74f50ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2420,6 +2420,7 @@ dependencies = [ "base64", "clap", "dialoguer", + "flate2", "fs2", "hex", "indicatif", @@ -2431,6 +2432,7 @@ dependencies = [ "serial_test", "sha2", "socket-patch-core", + "tar", "tempfile", "testcontainers", "tokio", diff --git a/crates/socket-patch-cli/Cargo.toml b/crates/socket-patch-cli/Cargo.toml index cedba95..e983539 100644 --- a/crates/socket-patch-cli/Cargo.toml +++ b/crates/socket-patch-cli/Cargo.toml @@ -59,6 +59,9 @@ setup-e2e = [] [dev-dependencies] sha2 = { workspace = true } +# scan_vendor_e2e builds pristine registry tarballs for the auto-fetch tests. 
+tar = { workspace = true } +flate2 = { workspace = true } hex = { workspace = true } wiremock = { workspace = true } portable-pty = { workspace = true } diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index e9ebd7f..4c562e1 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -281,6 +281,123 @@ pub(crate) fn detect_prunable( .collect() } +/// Lockfile-only packages: dependencies the project's lockfile resolves +/// that have no crawled (installed) counterpart. +#[derive(Default)] +struct LockfileSupplement { + packages: Vec, + /// Literal crawler-form purls, for fast membership tests. + purls: HashSet, + /// The lockfile the entries came from, for messages. + source: &'static str, +} + +/// Inventory the project's lockfile(s) and fabricate crawl entries for +/// dependencies that are not installed. The fabricated `path` is the +/// WOULD-BE install dir — every consumer degrades safely on a nonexistent +/// path (hash verify → NotFound, apply → partitioned skip, vendor → +/// auto-fetch). Global scans target the machine's global tree, not this +/// project's lockfile, so they get no supplement. 
+async fn lockfile_supplement( + common: &GlobalArgs, + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> LockfileSupplement { + use socket_patch_core::patch::vendor::lock_inventory; + use socket_patch_core::patch::vendor::npm_flavor::NpmLockFlavor; + + let mut out = LockfileSupplement::default(); + if common.global || common.global_prefix.is_some() { + return out; + } + let Some((flavor, entries)) = lock_inventory::inventory_npm_lock(&common.cwd).await else { + return out; + }; + out.source = match flavor { + NpmLockFlavor::PackageLock => "package-lock.json", + NpmLockFlavor::Pnpm => "pnpm-lock.yaml", + NpmLockFlavor::YarnClassic | NpmLockFlavor::YarnBerry => "yarn.lock", + NpmLockFlavor::Bun => "bun.lock", + }; + let crawled_purls: HashSet<&str> = crawled.iter().map(|p| p.purl.as_str()).collect(); + for entry in entries { + if crawled_purls.contains(entry.purl.as_str()) { + continue; + } + let (namespace, name) = match entry.name.split_once('/') { + Some((scope, bare)) => (Some(scope.to_string()), bare.to_string()), + None => (None, entry.name.clone()), + }; + out.purls.insert(entry.purl.clone()); + out.packages.push(socket_patch_core::crawlers::types::CrawledPackage { + name, + version: entry.version.clone(), + namespace, + purl: entry.purl.clone(), + path: common.cwd.join("node_modules").join(&entry.name), + }); + } + out +} + +/// A displayable crawl entry fabricated from a purl (decoded form). The +/// path is a placeholder consumers degrade safely on. 
+fn crawled_from_purl( + purl: &str, + cwd: &std::path::Path, +) -> Option<socket_patch_core::crawlers::types::CrawledPackage> { + let decoded = normalize_purl(strip_purl_qualifiers(purl)).into_owned(); + let rest = decoded.strip_prefix("pkg:")?; + let (_eco, rest) = rest.split_once('/')?; + let at = rest.rfind('@').filter(|&i| i > 0)?; + let (name_part, version) = (&rest[..at], &rest[at + 1..]); + let (namespace, name) = match name_part.rsplit_once('/') { + Some((ns, n)) => (Some(ns.to_string()), n.to_string()), + None => (None, name_part.to_string()), + }; + Some(socket_patch_core::crawlers::types::CrawledPackage { + name, + version: version.to_string(), + namespace, + purl: decoded.clone(), + path: cwd.join("node_modules").join(name_part), + }) +} + +/// Vendored-ledger packages with no crawled counterpart: on a fresh clone +/// the committed artifact IS the dependency, so these stay discoverable +/// (updates[] detection, the table, and `scan --vendor` re-vendor/in-sync +/// runs all keep working before any install). They are NOT "lockfile-only" +/// — nothing needs installing; the artifact satisfies the lock.
+async fn vendored_ledger_supplement( + common: &GlobalArgs, + crawled: &[socket_patch_core::crawlers::types::CrawledPackage], +) -> Vec { + if common.global || common.global_prefix.is_some() { + return Vec::new(); + } + let Ok(state) = socket_patch_core::patch::vendor::load_state(&common.cwd).await else { + return Vec::new(); + }; + let crawled_norm: HashSet = crawled + .iter() + .map(|p| normalize_purl(&p.purl).into_owned()) + .collect(); + let mut seen: HashSet = HashSet::new(); + let mut out = Vec::new(); + for entry in state.entries.values() { + let base = strip_purl_qualifiers(&entry.base_purl); + let norm = normalize_purl(base).into_owned(); + if crawled_norm.contains(&norm) || !seen.insert(norm) { + continue; + } + if let Some(pkg) = crawled_from_purl(base, &common.cwd) { + out.push(pkg); + } + } + out.sort_by(|a, b| a.purl.cmp(&b.purl)); + out +} + /// Vendor-mode pre-prompt check: uuids of selected patches whose installed /// files match NEITHER beforeHash nor afterHash — the patch was built /// against different bytes than the installed artifact. Vendoring still @@ -296,6 +413,7 @@ async fn preverify_vendor_baselines( org_slug: Option<&str>, selected: &[PatchSearchResult], crawled: &[socket_patch_core::crawlers::types::CrawledPackage], + lockfile_only: &HashSet, ) -> HashSet { use socket_patch_core::manifest::schema::PatchFileInfo; use socket_patch_core::patch::apply::{verify_file_patch, VerifyStatus}; @@ -306,6 +424,11 @@ async fn preverify_vendor_baselines( // API purls come percent-encoded, crawler purls literal — purl_eq // bridges the two spellings. let base = strip_purl_qualifiers(&patch.purl); + // Lockfile-only packages have no installed bytes to compare — the + // vendor engine fetches them pristine (nothing to annotate). 
+ if lockfile_only.contains(normalize_purl(base).as_ref()) { + continue; + } let Some(pkg) = crawled.iter().find(|c| purl_eq(&c.purl, base)) else { continue; }; @@ -951,6 +1074,39 @@ fn partition_vendored_selected( (kept, vendored_records) } +/// Lockfile-only patches are skipped BEFORE download in apply mode: the +/// package is not on disk to patch in place, and downloading its patch +/// into the manifest would create a not-yet-appliable entry (and flip the +/// apply path's exit code). `scan --vendor` is the route that handles them +/// (the vendor engine auto-fetches lockfile-resolved packages). Matching +/// bridges API purl encoding via `normalize_purl`. Same shape/mechanics as +/// [`partition_vendored_selected`]. +fn partition_not_installed_selected( + selected: Vec<PatchSearchResult>, + lockfile_only: &HashSet<String>, +) -> (Vec<PatchSearchResult>, Vec<serde_json::Value>) { + if lockfile_only.is_empty() { + return (selected, Vec::new()); + } + let is_lockfile_only = |p: &str| { + lockfile_only.contains(normalize_purl(strip_purl_qualifiers(p)).as_ref()) + }; + let (not_installed, kept): (Vec<_>, Vec<_>) = selected + .into_iter() + .partition(|p| is_lockfile_only(&p.purl)); + let mut records: Vec<serde_json::Value> = not_installed + .iter() + .map(|p| { + serde_json::json!({ + "purl": p.purl, "uuid": p.uuid, + "action": "skipped", "errorCode": "package_not_installed", + }) + }) + .collect(); + records.sort_by(|a, b| a["purl"].as_str().cmp(&b["purl"].as_str())); + (kept, records) +} + /// Fold the pre-download vendored skips into the apply report returned by /// `download_and_apply_patches`: they were "found" by discovery and /// skipped here, never downloaded.
Also strips the inner `status` (scan @@ -1184,7 +1340,24 @@ pub async fn run(args: ScanArgs) -> i32 { } // Crawl packages - let (all_crawled, eco_counts) = crawl_all_ecosystems(&crawler_options).await; + let (mut all_crawled, mut eco_counts) = crawl_all_ecosystems(&crawler_options).await; + + // Lockfile supplement: dependencies the project's lockfile resolves + // that have NO installed copy (fresh clone, partial install). They join + // discovery — counts, API lookup, table, the prune "scanned" set — and + // are flagged "not yet installed" everywhere a user could act on them. + let lockfile_only = lockfile_supplement(&args.common, &all_crawled).await; + if !lockfile_only.packages.is_empty() { + *eco_counts.entry(Ecosystem::Npm).or_insert(0) += lockfile_only.packages.len(); + all_crawled.extend(lockfile_only.packages.iter().cloned()); + } + let ledger_supplement = vendored_ledger_supplement(&args.common, &all_crawled).await; + for pkg in &ledger_supplement { + if let Some(eco) = Ecosystem::from_purl(&pkg.purl) { + *eco_counts.entry(eco).or_insert(0) += 1; + } + } + all_crawled.extend(ledger_supplement); // Every PURL the crawl found, captured BEFORE the `--ecosystems` // display/query filter is applied. Prune (below) must reference the @@ -1193,6 +1366,9 @@ pub async fn run(args: ScanArgs) -> i32 { // prune used the filtered set instead, `scan --ecosystems npm --prune` // would treat every cargo/go/pypi/gem manifest entry as "uninstalled" // and delete it (plus its blobs) — silent cross-ecosystem data loss. + // Lockfile-only purls are deliberately included: a dependency the + // lockfile still resolves must not be pruned just because node_modules + // is wiped or partially installed. 
let installed_purls: HashSet = all_crawled.iter().map(|p| p.purl.clone()).collect(); // Vendor-ledger purl keys, loaded once and shared by the prune @@ -1250,6 +1426,7 @@ pub async fn run(args: ScanArgs) -> i32 { let mut result = serde_json::json!({ "status": "success", "scannedPackages": 0, + "lockfileOnlyPackages": 0, "packagesWithPatches": 0, "totalPatches": 0, "freePatches": 0, @@ -1310,6 +1487,13 @@ pub async fn run(args: ScanArgs) -> i32 { } else { eprintln!("Found {package_count} packages{eco_summary}"); } + if !lockfile_only.purls.is_empty() { + eprintln!( + "Note: {} package(s) from {} are not yet installed (lockfile-only).", + lockfile_only.purls.len(), + lockfile_only.source, + ); + } } // Query API in batches @@ -1495,6 +1679,7 @@ pub async fn run(args: ScanArgs) -> i32 { let mut result = serde_json::json!({ "status": "success", "scannedPackages": package_count, + "lockfileOnlyPackages": lockfile_only.purls.len(), "packagesWithPatches": all_packages_with_patches.len(), "totalPatches": total_patches, "freePatches": free_patches, @@ -1507,6 +1692,19 @@ pub async fn run(args: ScanArgs) -> i32 { "newUuid": u.new_uuid, })).collect::>(), }); + // Flag lockfile-only packages so JSON consumers can tell "patch + // available but not installed" from the installed case. Additive + // field; absent means installed. + if let Some(packages) = result["packages"].as_array_mut() { + for pkg in packages { + let is_lockfile_only = pkg["purl"] + .as_str() + .is_some_and(|p| lockfile_only.purls.contains(p)); + if is_lockfile_only { + pkg["notInstalled"] = serde_json::json!(true); + } + } + } // `apply` and `prune` are computed once at the top of run() // (factoring in --sync, which implies both). They're independent @@ -1549,6 +1747,17 @@ pub async fn run(args: ScanArgs) -> i32 { // operator's signal to run `scan --vendor` (or `vendor`). 
let (selected, vendored_records) = partition_vendored_selected(selected, &vendored_purls); + // Lockfile-only purls leave the apply selection here (calm + // skip records, never an error); the union rides the same + // bookkeeping as the vendored skips. + let (selected, vendored_records) = { + let (kept, not_installed) = + partition_not_installed_selected(selected, &lockfile_only.purls); + let mut all = vendored_records; + all.extend(not_installed); + all.sort_by(|a, b| a["purl"].as_str().cmp(&b["purl"].as_str())); + (kept, all) + }; let mut apply_code = 0i32; if dry { @@ -1791,14 +2000,22 @@ pub async fn run(args: ScanArgs) -> i32 { } else { String::new() }; + // Lockfile-only packages can be patched by `scan --vendor` + // (which fetches them pristine) but not applied in place. + let not_installed_marker = if lockfile_only.purls.contains(pkg.purl.as_str()) { + color(" [NOT INSTALLED]", "33", use_color) + } else { + String::new() + }; println!( - "{:<40} {:>8} {:<16} {}{}", + "{:<40} {:>8} {:<16} {}{}{}", display_purl, count_str, format_severity(severity, use_color), vuln_str, update_marker, + not_installed_marker, ); } @@ -1930,6 +2147,29 @@ pub async fn run(args: ScanArgs) -> i32 { } } + // Lockfile-only purls leave the in-place apply selection (calm skip, + // mirrors the JSON path). In `--vendor` mode they stay: the vendor + // engine fetches lockfile-resolved packages pristine. 
+ let (selected, not_installed_selected): (Vec<_>, Vec<String>) = if args.vendor { + (selected, Vec::new()) + } else { + let (kept, skipped) = partition_not_installed_selected(selected, &lockfile_only.purls); + let printed: Vec<String> = skipped + .iter() + .filter_map(|r| r["purl"].as_str().map(str::to_string)) + .collect(); + (kept, printed) + }; + if !args.common.silent { + for purl in &not_installed_selected { + println!( + " [skip] {} (not installed — run your package manager's install first, \ + or `scan --vendor` to vendor it from the lockfile)", + normalize_purl(purl) + ); + } + } + if selected.is_empty() && !args.vendor { if !args.common.silent { println!("No patches selected."); @@ -1946,6 +2186,7 @@ pub async fn run(args: ScanArgs) -> i32 { effective_org_slug, &selected, &filtered_crawled, + &lockfile_only.purls, ) .await } else { diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index d032439..8c0c417 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -525,13 +525,140 @@ pub(crate) async fn vendor_records( global_prefix: common.global_prefix.clone(), batch_size: 100, }; - let all_packages = find_packages_for_purls( + let mut all_packages = find_packages_for_purls( &vendorable_partition, &crawler_options, common.silent || common.json, ) .await; + // ── Auto-fetch: lockfile-resolved packages with no installed copy ──── + // A manifest patch whose package is not on disk but IS resolvable from + // the project's lockfile is fetched pristine from its registry (lock- + // recorded URL else the conventional one), verified against the lock's + // integrity FAIL-CLOSED, and staged from a private tempdir — the + // project tree is never touched, and the lock wiring works without an + // installed copy (it keys off lock entries). The holders keep the + // tempdirs alive until the dispatch loop below has staged from them.
+ let mut fetched_holders: Vec = + Vec::new(); + // Fetch failures must keep their distinct Failed event; this set + // suppresses the later duplicate `package_not_installed` skip. + let mut fetch_failed: HashSet = HashSet::new(); + { + use socket_patch_core::patch::vendor::{lock_inventory, registry_fetch}; + let missing: Vec = vendorable + .iter() + .filter(|p| !all_packages.contains_key(*p)) + .cloned() + .collect(); + if !missing.is_empty() { + // The inventory is a local file read — fine offline; only the + // fetch itself needs the network. + let inventory = lock_inventory::inventory_npm_lock(&common.cwd) + .await + .map(|(_, entries)| entries) + .unwrap_or_default(); + let client = registry_fetch::build_registry_client(); + // Pre-loaded vendor ledger for the artifact-staging path: an + // already-vendored purl with no installed copy (fresh clone) + // stages from its own committed artifact, sha256-verified + // against the ledger — offline-safe, no registry traffic. + let ledger = load_state(&common.cwd).await.unwrap_or_default(); + for purl in &missing { + if let Some(entry) = ledger + .entries + .get(purl) + .or_else(|| ledger.entries.values().find(|e| &e.base_purl == purl)) + .filter(|e| e.ecosystem == "npm" && e.artifact.path.ends_with(".tgz")) + { + let tgz = common.cwd.join(&entry.artifact.path); + match registry_fetch::stage_local_artifact(&tgz, &entry.artifact.sha256) + .await + { + Ok(staged) => { + all_packages.insert(purl.clone(), staged.dir().to_path_buf()); + fetched_holders.push(staged); + continue; + } + Err(registry_fetch::FetchError::Failed(detail)) => { + // A corrupt committed artifact is worth a loud + // failure — re-vendoring over it would mask the + // corruption. 
+ fetch_failed.insert(purl.clone()); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!( + "Cannot vendor {}: {detail}", + normalize_purl(purl) + ); + } + continue; + } + Err(registry_fetch::FetchError::Unverifiable(_)) => { + // No recorded hash (legacy ledger) — fall + // through to the lockfile/registry path. + } + } + } + let Some(entry) = lock_inventory::lookup(&inventory, purl) else { + continue; // not lockfile-resolvable → package_not_installed + }; + if common.offline { + // The enriched skip detail lands below in the unmatched + // pass (the purl stays unmatched). + continue; + } + match registry_fetch::fetch_and_stage(entry, &client).await { + Ok(fetched) => { + record_warning( + env, + purl, + &VendorWarning::new( + "vendor_fetched_missing", + format!( + "{}@{} is not installed; fetched the pristine artifact \ + from {} (integrity verified against the lockfile) and \ + vendored from that copy — the project tree was not \ + touched", + entry.name, entry.version, fetched.url + ), + ), + common, + ); + all_packages.insert(purl.clone(), fetched.dir().to_path_buf()); + fetched_holders.push(fetched); + } + Err(registry_fetch::FetchError::Unverifiable(detail)) => { + record_warning( + env, + purl, + &VendorWarning::new("vendor_fetch_unverifiable", detail), + common, + ); + // Falls through to package_not_installed below. 
+ } + Err(registry_fetch::FetchError::Failed(detail)) => { + fetch_failed.insert(purl.clone()); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!( + "Cannot vendor {}: fetch failed: {detail}", + normalize_purl(purl) + ); + } + } + } + } + } + } + let vendored_at = now_rfc3339(); let mut state = match load_state(&common.cwd).await { Ok(s) => s, @@ -763,10 +890,10 @@ pub(crate) async fn vendor_records( } // Manifest entries that targeted in-scope ecosystems but had no - // installed package on disk. + // installed package on disk (and could not be auto-fetched). let mut unmatched: Vec = vendorable .iter() - .filter(|p| !matched.contains(*p)) + .filter(|p| !matched.contains(*p) && !fetch_failed.contains(*p)) .cloned() .collect(); unmatched.sort(); @@ -776,15 +903,42 @@ pub(crate) async fn vendor_records( .map(|p| strip_purl_qualifiers(p).to_string()) .collect(); unmatched.retain(|p| !vendored_bases.contains(strip_purl_qualifiers(p))); + has_errors |= !fetch_failed.is_empty(); if !unmatched.is_empty() { has_errors = true; + // Offline runs name the packages the lockfile COULD have fetched — + // the inventory is a local file read, allowed offline. 
+ let lock_resolvable: HashSet = if common.offline { + let entries = socket_patch_core::patch::vendor::lock_inventory::inventory_npm_lock( + &common.cwd, + ) + .await + .map(|(_, e)| e) + .unwrap_or_default(); + unmatched + .iter() + .filter(|p| { + socket_patch_core::patch::vendor::lock_inventory::lookup(&entries, p) + .is_some() + }) + .cloned() + .collect() + } else { + HashSet::new() + }; for purl in &unmatched { + let detail = if lock_resolvable.contains(purl) { + "no installed package found; --offline prevents fetching it from the \ + registry (the lockfile resolves it)" + } else { + "no installed package found" + }; env.record( PatchEvent::new(PatchAction::Skipped, purl.clone()) - .with_reason("package_not_installed", "no installed package found"), + .with_reason("package_not_installed", detail), ); if !common.silent && !common.json { - eprintln!("Cannot vendor {}: package not installed", normalize_purl(purl)); + eprintln!("Cannot vendor {}: {detail}", normalize_purl(purl)); } } } diff --git a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs index 64f0b28..d3e8cb5 100644 --- a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs +++ b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs @@ -737,3 +737,408 @@ async fn scan_vendor_annotates_mismatched_baseline_and_vendors_anyway() { .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) .is_file()); } + +// ───────────── lockfile auto-fetch + scan lockfile supplement ───────────── + +/// sha512 SRI of the given bytes (what an npm-family lock records). +fn sri_of(bytes: &[u8]) -> String { + use base64::Engine as _; + use sha2::Sha512; + format!( + "sha512-{}", + base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes)) + ) +} + +/// A pristine registry tarball for left-pad@1.3.0 whose index.js carries +/// the patch's BEFORE bytes. 
+fn pristine_tgz() -> Vec<u8> { + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + Vec::new(), + flate2::Compression::default(), + )); + for (path, bytes) in [ + ( + "package/package.json", + br#"{"name":"left-pad","version":"1.3.0"}"#.as_slice(), + ), + ("package/index.js", BEFORE), + ] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, path, bytes).unwrap(); + } + builder.into_inner().unwrap().finish().unwrap() +} + +/// Project fixture with a lockfile but NO node_modules: package.json + +/// package-lock.json whose left-pad entry resolves to `resolved_url` with +/// `integrity`. +fn write_lockfile_only_fixture(root: &Path, resolved_url: &str, integrity: &str) { + std::fs::write( + root.join("package.json"), + r#"{ "name": "scan-vendor-test", "version": "0.0.0", "dependencies": { "left-pad": "^1.3.0" } }"#, + ) + .unwrap(); + let lock = serde_json::json!({ + "name": "scan-vendor-test", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "scan-vendor-test", + "version": "0.0.0", + "dependencies": { "left-pad": "^1.3.0" } + }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": resolved_url, + "integrity": integrity, + "license": "WTFPL" + } + } + }); + let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap(); + lock_bytes.push(b'\n'); + std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap(); +} + +/// Pre-seed `.socket/manifest.json` + the after-blob so a standalone +/// `vendor` run has local patch sources (no patch-API traffic).
+fn seed_manifest_and_blob(root: &Path) { + let socket = root.join(".socket"); + std::fs::create_dir_all(socket.join("blobs")).unwrap(); + let manifest = serde_json::json!({ + "patches": { + PURL: { + "uuid": UUID, + "exportedAt": "2026-01-01T00:00:00Z", + "files": { + "package/index.js": { + "beforeHash": git_sha256(BEFORE), + "afterHash": git_sha256(AFTER), + } + }, + "vulnerabilities": {}, + "description": "synthetic", + "license": "MIT", + "tier": "free" + } + } + }); + std::fs::write( + socket.join("manifest.json"), + serde_json::to_vec_pretty(&manifest).unwrap(), + ) + .unwrap(); + std::fs::write(socket.join("blobs").join(git_sha256(AFTER)), AFTER).unwrap(); +} + +async fn mount_registry_tarball(mock: &MockServer, tgz: Vec) { + Mock::given(method("GET")) + .and(path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(mock) + .await; +} + +fn run_vendor(root: &Path, extra: &[&str]) -> (i32, serde_json::Value, String) { + let mut argv = vec!["vendor", "--json"]; + argv.extend_from_slice(extra); + let out = Command::new(binary()) + .args(&argv) + .current_dir(root) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run vendor"); + let stdout = String::from_utf8_lossy(&out.stdout).into_owned(); + let stderr = String::from_utf8_lossy(&out.stderr).into_owned(); + let v: serde_json::Value = serde_json::from_str(stdout.trim()) + .unwrap_or_else(|e| panic!("vendor --json must emit JSON: {e}\n{stdout}\n{stderr}")); + (out.status.code().unwrap_or(-1), v, stderr) +} + +/// A manifest patch whose package is NOT installed but IS lockfile-resolved +/// is fetched pristine from the registry (integrity-verified against the +/// lock) and vendored — node_modules never appears. 
+#[tokio::test] +async fn vendor_auto_fetches_missing_package_from_lockfile() { + let mock = MockServer::start().await; + let tgz = pristine_tgz(); + let integrity = sri_of(&tgz); + mount_registry_tarball(&mock, tgz).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_eq!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events.iter().any(|e| e["action"] == "applied" && e["purl"] == PURL), + "{v:#}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "vendor_fetched_missing"), + "fetch surfaced as a warning event: {v:#}" + ); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); + let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); + assert!(lock.contains(&format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"))); + assert!( + !tmp.path().join("node_modules").exists(), + "the project tree is never touched" + ); +} + +/// Integrity mismatch between the lock and the served bytes is a distinct +/// vendor_fetch_failed failure — and nothing is written. 
+#[tokio::test] +async fn vendor_fetch_integrity_mismatch_is_vendor_fetch_failed() { + let mock = MockServer::start().await; + mount_registry_tarball(&mock, pristine_tgz()).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &sri_of(b"the lock expects different bytes"), + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_ne!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["action"] == "failed" && e["errorCode"] == "vendor_fetch_failed"), + "{v:#}" + ); + assert!( + !events + .iter() + .any(|e| e["errorCode"] == "package_not_installed"), + "no duplicate not-installed skip: {v:#}" + ); + assert!(!tmp.path().join(".socket/vendor").exists()); +} + +/// --offline refuses the fetch with a calm package_not_installed skip that +/// names the lockfile as the would-be source. No HTTP traffic happens (no +/// registry route is mounted — a request would 404 and fail differently). 
+#[tokio::test] +async fn vendor_offline_refuses_fetch_with_calm_skip() { + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "http://127.0.0.1:1/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"irrelevant"), + ); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &["--offline"]); + assert_ne!(code, 0, "not-installed stays a non-benign skip: {v:#}"); + let events = v["events"].as_array().unwrap(); + let skip = events + .iter() + .find(|e| e["errorCode"] == "package_not_installed") + .unwrap_or_else(|| panic!("{v:#}")); + assert!( + skip["reason"] + .as_str() + .unwrap_or("") + .contains("--offline prevents fetching"), + "offline detail names the lockfile resolution: {v:#}" + ); +} + +/// An entry whose lock records no integrity is never fetched (fail-closed) +/// and keeps the plain not-installed outcome plus an explanatory warning. +#[tokio::test] +async fn vendor_fetch_unverifiable_lock_entry_stays_not_installed() { + let tmp = tempfile::tempdir().unwrap(); + // Hand-write a lock whose entry has no integrity field. 
+ std::fs::write( + tmp.path().join("package.json"), + r#"{ "name": "x", "version": "0.0.0" }"#, + ) + .unwrap(); + std::fs::write( + tmp.path().join("package-lock.json"), + serde_json::to_vec_pretty(&serde_json::json!({ + "name": "x", "version": "0.0.0", "lockfileVersion": 3, + "packages": { + "": { "name": "x", "version": "0.0.0" }, + "node_modules/left-pad": { + "version": "1.3.0", + "resolved": "http://127.0.0.1:1/left-pad/-/left-pad-1.3.0.tgz" + } + } + })) + .unwrap(), + ) + .unwrap(); + seed_manifest_and_blob(tmp.path()); + + let (code, v, _) = run_vendor(tmp.path(), &[]); + assert_ne!(code, 0, "{v:#}"); + let events = v["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "vendor_fetch_unverifiable"), + "{v:#}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "package_not_installed"), + "{v:#}" + ); +} + +/// The headline flow: a COMPLETELY fresh clone (lockfile, no node_modules, +/// no .socket) discovers from the lockfile and `scan --vendor` vendors +/// end-to-end via the registry fetch. 
+#[tokio::test] +async fn scan_vendor_works_on_a_completely_fresh_clone() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tgz = pristine_tgz(); + let integrity = sri_of(&tgz); + mount_registry_tarball(&mock, tgz).await; + + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()), + &integrity, + ); + + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["lockfileOnlyPackages"], 1, "{v}"); + assert_eq!(v["vendor"]["summary"]["applied"], 1, "{v}"); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) + .is_file()); + assert!(!tmp.path().join("node_modules").exists()); + + // Second run: in sync. + let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); + assert_eq!(code, 0, "stdout={stdout}; stderr={stderr}"); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + let events = v["vendor"]["events"].as_array().unwrap(); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "already_vendored"), + "{v}" + ); +} + +/// Read-only discovery flags lockfile-only packages in JSON and the human +/// table. +#[tokio::test] +async fn scan_discovers_lockfile_only_packages_with_warning() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"unused for discovery"), + ); + + // JSON shape. 
+ let out = Command::new(binary()) + .args([ + "scan", "--json", "--api-url", &mock.uri(), "--api-token", "fake-token", + "--org", ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(v["scannedPackages"], 1, "{v}"); + assert_eq!(v["lockfileOnlyPackages"], 1, "{v}"); + assert_eq!(v["packages"][0]["notInstalled"], true, "{v}"); + + // Human output: the table marker + the note. + let out = Command::new(binary()) + .args([ + "scan", "--api-url", &mock.uri(), "--api-token", "fake-token", + "--org", ORG_SLUG, "--dry-run", "--yes", + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + assert!( + stdout.contains("[NOT INSTALLED]"), + "stdout={stdout}; stderr={stderr}" + ); + assert!( + stderr.contains("not yet installed (lockfile-only)"), + "stderr={stderr}" + ); +} + +/// `scan --apply` skips lockfile-only patches calmly: exit 0, a skipped +/// record with package_not_installed, and NO manifest entry written. 
+#[tokio::test] +async fn scan_apply_skips_lockfile_only_without_error() { + let mock = MockServer::start().await; + mount_patch_api(&mock, UUID).await; + let tmp = tempfile::tempdir().unwrap(); + write_lockfile_only_fixture( + tmp.path(), + "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz", + &sri_of(b"unused"), + ); + + let out = Command::new(binary()) + .args([ + "scan", "--json", "--apply", "--yes", "--api-url", &mock.uri(), + "--api-token", "fake-token", "--org", ORG_SLUG, + ]) + .current_dir(tmp.path()) + .env("SOCKET_TELEMETRY_DISABLED", "1") + .output() + .expect("run"); + let stdout = String::from_utf8_lossy(&out.stdout); + let code = out.status.code().unwrap_or(-1); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(code, 0, "lockfile-only must not flip the exit code: {v}"); + assert_eq!(v["status"], "success", "{v}"); + let patches = v["apply"]["patches"].as_array().unwrap(); + assert!( + patches.iter().any(|p| p["action"] == "skipped" + && p["errorCode"] == "package_not_installed"), + "{v}" + ); + assert!( + !tmp.path().join(".socket/manifest.json").exists(), + "no manifest entry is written for a not-installed package" + ); +} diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs index 80dc2d3..6c8c0a1 100644 --- a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -145,6 +145,48 @@ async fn fetch_npm( }) } +/// Stage a package from an on-disk vendored tarball (the fresh-clone +/// re-vendor path: the project has our committed artifact but no installed +/// copy). The bytes are verified against the LEDGER-recorded sha256 before +/// extraction — same fail-closed posture as the registry path; an entry +/// with no recorded hash is refused. 
+pub async fn stage_local_artifact(
+    tgz_path: &Path,
+    expected_sha256_hex: &str,
+) -> Result<FetchedPackage, FetchError> {
+    if expected_sha256_hex.is_empty() { // fail closed: nothing to verify against
+        return Err(FetchError::Unverifiable(
+            "the vendor ledger records no sha256 for the artifact".to_string(),
+        ));
+    }
+    let bytes = tokio::fs::read(tgz_path)
+        .await
+        .map_err(|e| FetchError::Failed(format!("cannot read {}: {e}", tgz_path.display())))?;
+    if bytes.len() as u64 > MAX_DOWNLOAD_BYTES { // same size cap as the download path
+        return Err(FetchError::Failed(format!(
+            "{}: artifact exceeds the {MAX_DOWNLOAD_BYTES}-byte cap",
+            tgz_path.display()
+        )));
+    }
+    let actual = hex::encode(Sha256::digest(&bytes));
+    if !actual.eq_ignore_ascii_case(expected_sha256_hex) { // hex case is not significant
+        return Err(FetchError::Failed(format!(
+            "{}: sha256 mismatch against the vendor ledger (recorded {expected_sha256_hex}, \
+             on-disk bytes hash to {actual})",
+            tgz_path.display()
+        )));
+    }
+    let tmp = tempfile::tempdir()
+        .map_err(|e| FetchError::Failed(format!("cannot create staging tempdir: {e}")))?;
+    let dir = tmp.path().join("package");
+    extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?; // only verified bytes are extracted
+    Ok(FetchedPackage {
+        dir,
+        url: format!("file:{}", tgz_path.display()),
+        _tmp: tmp, // keeps the staging tempdir alive alongside `dir`
+    })
+}
+
 /// Capped download. http(s) only; the cap is enforced on the declared
 /// Content-Length AND the actual stream (a lying server cannot blow past
 /// it).
From 430145a213a331ccf919a0a6b3ce1a1ec81b696b Mon Sep 17 00:00:00 2001
From: Mikola Lysenko
Date: Thu, 11 Jun 2026 18:30:36 -0400
Subject: [PATCH 09/19] feat(vendor): yarn berry checksum-verified fetch + ledger artifact staging tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Berry locks never hash the tarball — the checksum is sha512 of the
deterministic cache zip. The fetch rebuilds that zip from the fetched
bytes via the same spike-pinned berry_zip recipe the wiring uses and
compares the 10c0/ value fail-closed (foreign cacheKeys are Unverifiable).
Plus unit coverage for stage_local_artifact's ledger-sha256 gate. Co-Authored-By: Claude Fable 5 --- .../src/patch/vendor/registry_fetch.rs | 88 ++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs index 6c8c0a1..1ab31b8 100644 --- a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -126,7 +126,29 @@ async fn fetch_npm( npm_tarball_url(&npm_registry_base(), &entry.name, &entry.version) }); let bytes = download(client, &url).await.map_err(FetchError::Failed)?; - verify_integrity(&bytes, &entry.integrity)?; + match &entry.integrity { + // yarn berry locks never hash the tarball itself — the checksum is + // sha512 of the deterministic cache zip. Rebuild it from the fetched + // bytes (the same spike-pinned recipe the berry wiring uses) and + // compare. Only cacheKey 10c0 (yarn 4 default) is reproducible. 
+ LockIntegrity::BerryChecksum(expected) => { + if !expected.starts_with("10c0/") { + return Err(FetchError::Unverifiable(format!( + "yarn berry checksum `{expected}` uses a cacheKey other than 10c0; the \ + cache-zip recipe is not reproducible for it" + ))); + } + let actual = super::berry_zip::berry_cache_checksum_10c0(&bytes, &entry.name) + .map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "yarn berry cache checksum mismatch: lockfile records {expected}, the \ + fetched tarball rebuilds to {actual}" + ))); + } + } + other => verify_integrity(&bytes, other)?, + } let tmp = tempfile::tempdir() .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; @@ -581,6 +603,70 @@ mod tests { } } + #[tokio::test] + async fn berry_checksum_verifies_via_cache_zip_rebuild() { + let tgz = make_tgz(&[ + ("package/package.json", br#"{"name":"left-pad"}"#, false), + ("package/index.js", b"module.exports = 1;\n", false), + ]); + let expected = + super::super::berry_zip::berry_cache_checksum_10c0(&tgz, "left-pad").unwrap(); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/-/left-pad-1.3.0.tgz")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz)) + .mount(&mock) + .await; + + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(expected), + ); + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("package.json").is_file()); + + // Tampered checksum → Failed; foreign cacheKey → Unverifiable. 
+ let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(format!("10c0/{}", "0".repeat(128))), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + let entry = npm_entry( + Some(format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri())), + LockIntegrity::BerryChecksum(format!("9/{}", "0".repeat(128))), + ); + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => assert!(msg.contains("cacheKey"), "{msg}"), + other => panic!("expected Unverifiable, got {other:?}"), + } + } + + #[tokio::test] + async fn stage_local_artifact_verifies_ledger_sha256() { + let tgz = make_tgz(&[("package/package.json", b"{}", false)]); + let tmp = tempfile::tempdir().unwrap(); + let tgz_path = tmp.path().join("left-pad-1.3.0.tgz"); + std::fs::write(&tgz_path, &tgz).unwrap(); + let sha = hex::encode(Sha256::digest(&tgz)); + + let staged = stage_local_artifact(&tgz_path, &sha).await.unwrap(); + assert!(staged.dir().join("package.json").is_file()); + + match stage_local_artifact(&tgz_path, &"0".repeat(64)).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected ledger mismatch, got {other:?}"), + } + match stage_local_artifact(&tgz_path, "").await { + Err(FetchError::Unverifiable(_)) => {} + other => panic!("expected Unverifiable for empty hash, got {other:?}"), + } + } + #[test] fn oversized_entry_header_fails_closed() { // A header CLAIMING more than the per-entry cap fails before any From 16e7e54d426dd67e3863f5aa1a9f7b280c92ed83 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 18:37:21 -0400 Subject: [PATCH 10/19] feat(vendor): cargo + golang lockfile inventory and verified fetch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit

- Cargo.lock [[package]] inventory: crates.io-sourced entries carry their
  sha256 .crate checksum (Sha256Hex); workspace members are skipped and
  git/custom-registry sources are discovery-only. Fetch from
  static.crates.io (SOCKET_CRATES_REGISTRY override), verify, extract
  ({name}-{version}/ top dir) — feeds vendor_cargo_crate's pristine_src.
- go.sum inventory: module-zip h1: lines (the manifest-only /go.mod lines
  are skipped). Fetch from the module proxy (SOCKET_GOPROXY, else the
  standard GOPROXY's first element that is not direct/off, else
  proxy.golang.org) with Go's !uppercase path escaping; verify the dirhash
  Hash1/HashZip in memory BEFORE extraction (algorithm validated against a
  live sum.golang.org lookup for golang.org/x/text@v0.14.0); extract with
  the explicit module@version/ prefix (module paths contain slashes, so a
  first-component strip would be wrong) — feeds vendor_go_module.
- lookup() generalized across ecosystems; inventory_project() returns the
  union the scan supplement and vendor auto-fetch consume.

Co-Authored-By: Claude Fable 5 --- .../src/patch/vendor/lock_inventory.rs | 284 ++++++++++++- .../src/patch/vendor/registry_fetch.rs | 384 ++++++++++++++++++ 2 files changed, 652 insertions(+), 16 deletions(-) diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs index f663b51..c391681 100644 --- a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs +++ b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -25,7 +25,7 @@ use serde_json::Value; use crate::patch::path_safety; use crate::utils::purl::strip_purl_qualifiers; -use super::npm_common::{is_safe_npm_name, parse_npm_purl}; +use super::npm_common::is_safe_npm_name; use super::npm_flavor::{detect_npm_lock_flavor, NpmLockFlavor}; use super::path::parse_vendor_path; use super::{bun_lock, pnpm_lock, yarn_berry_lock, yarn_classic_lock}; @@ -111,32 +111,67 @@ pub async fn inventory_npm_lock( } /// Match a manifest/API purl (possibly percent-encoded, possibly carrying -/// qualifiers) against the inventory. npm purls decode via -/// [`parse_npm_purl`] so `pkg:npm/%40scope/x@1` matches the literal entry. +/// qualifiers) against the inventory: components decode via +/// [`crate::utils::purl::normalize_purl`], so `pkg:npm/%40scope/x@1` +/// matches the literal entry. pub fn lookup<'a>(entries: &'a [LockfileEntry], purl: &str) -> Option<&'a LockfileEntry> { - let base = strip_purl_qualifiers(purl); - if base.starts_with("pkg:npm/") { - let (name, version) = parse_npm_purl(base)?; - return entries - .iter() - .find(|e| e.ecosystem == "npm" && e.name == name && e.version == version); + let decoded = crate::utils::purl::normalize_purl(strip_purl_qualifiers(purl)).into_owned(); + let rest = decoded.strip_prefix("pkg:")?; + let (purl_type, rest) = rest.split_once('/')?; + // purl type → vendor-ecosystem tag (same mapping the dispatcher uses). 
+ let eco = match purl_type { + "npm" => "npm", + "cargo" => "cargo", + "golang" => "golang", + "pypi" => "pypi", + "gem" => "gem", + "composer" => "composer", + _ => return None, + }; + let at = rest.rfind('@').filter(|&i| i > 0)?; + let (name, version) = (&rest[..at], &rest[at + 1..]); + entries + .iter() + .find(|e| e.ecosystem == eco && e.name == name && e.version == version) +} + +/// Everything every recognized lockfile in the project resolves — the +/// union the scan supplement and the vendor auto-fetch consume. +pub async fn inventory_project(project_root: &Path) -> Vec { + let mut out: Vec = Vec::new(); + if let Some((_, entries)) = inventory_npm_lock(project_root).await { + out.extend(entries); + } + #[cfg(feature = "cargo")] + if let Some(entries) = inventory_cargo_lock(project_root).await { + out.extend(entries); } - // Other ecosystems route here as their fetchers land. - None + #[cfg(feature = "golang")] + if let Some(entries) = inventory_go_sum(project_root).await { + out.extend(entries); + } + out } /// Guard + dedup the raw npm entries: unsafe names/versions are dropped /// fail-closed; duplicate (name, version) instances collapse to one, /// preferring the instance that carries a verifier. fn finalize_npm(raw: Vec) -> Vec { + dedup_prefer_integrity( + raw.into_iter() + .filter(|e| { + is_safe_npm_name(&e.name) && path_safety::is_safe_single_segment(&e.version) + }) + .collect(), + ) +} + +/// Collapse duplicate (name, version) instances, preferring one that +/// carries a verifier. 
+fn dedup_prefer_integrity(raw: Vec) -> Vec { let mut seen: HashMap<(String, String), usize> = HashMap::new(); let mut out: Vec = Vec::new(); for entry in raw { - if !is_safe_npm_name(&entry.name) - || !path_safety::is_safe_single_segment(&entry.version) - { - continue; - } let key = (entry.name.clone(), entry.version.clone()); match seen.get(&key) { Some(&i) => { @@ -155,6 +190,117 @@ fn finalize_npm(raw: Vec) -> Vec { out } +// ──────────────────────────────── Cargo.lock ──────────────────────────────── + +/// Inventory `Cargo.lock` `[[package]]` blocks. Only crates.io-sourced +/// entries are fetchable (their `checksum` is the sha256 of the `.crate` +/// file); workspace members (no `source`) are skipped, and git/custom- +/// registry sources stay listed for discovery without a verifier. +#[cfg(feature = "cargo")] +pub async fn inventory_cargo_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("Cargo.lock")) + .await + .ok()?; + let mut out = Vec::new(); + let mut cur: Option<(Option, Option, Option, Option)> = None; + let flush = |cur: &mut Option<(Option, Option, Option, Option)>, + out: &mut Vec| { + if let Some((Some(name), Some(version), source, checksum)) = cur.take() { + let Some(source) = source else { + return; // workspace member + }; + if !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + { + return; + } + let crates_io = source.contains("github.com/rust-lang/crates.io-index") + || source.contains("index.crates.io"); + let integrity = match checksum { + Some(c) if crates_io && c.len() == 64 && c.bytes().all(|b| b.is_ascii_hexdigit()) => { + LockIntegrity::Sha256Hex(c) + } + _ => LockIntegrity::None, + }; + let purl = format!("pkg:cargo/{name}@{version}"); + out.push(LockfileEntry { + ecosystem: "cargo", + name, + version, + purl, + resolved: None, + integrity, + }); + } + }; + for line in text.lines() { + let line = line.trim(); + if line == "[[package]]" { + 
flush(&mut cur, &mut out); + cur = Some((None, None, None, None)); + continue; + } + if line.starts_with('[') { + flush(&mut cur, &mut out); + continue; + } + let Some(slot) = cur.as_mut() else { continue }; + let Some((key, value)) = line.split_once('=') else { + continue; + }; + let value = value.trim().trim_matches('"').to_string(); + match key.trim() { + "name" => slot.0 = Some(value), + "version" => slot.1 = Some(value), + "source" => slot.2 = Some(value), + "checksum" => slot.3 = Some(value), + _ => {} + } + } + flush(&mut cur, &mut out); + Some(dedup_prefer_integrity(out)) +} + +// ────────────────────────────────── go.sum ────────────────────────────────── + +/// Inventory `go.sum` module-zip lines (` h1:`); the +/// `/go.mod`-suffixed lines hash only the manifest and are skipped. go.sum +/// may list more modules than the final build graph — acceptable for +/// discovery, and the manifest decides what actually gets vendored. +#[cfg(feature = "golang")] +pub async fn inventory_go_sum(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("go.sum")) + .await + .ok()?; + let mut out = Vec::new(); + for line in text.lines() { + let mut parts = line.split_whitespace(); + let (Some(module), Some(version), Some(hash)) = + (parts.next(), parts.next(), parts.next()) + else { + continue; + }; + if version.ends_with("/go.mod") || !hash.starts_with("h1:") { + continue; + } + // SECURITY: module path segments and the version feed paths/URLs. 
+ if !path_safety::is_safe_multi_segment(module) + || !path_safety::is_safe_single_segment(version) + { + continue; + } + out.push(LockfileEntry { + ecosystem: "golang", + name: module.to_string(), + version: version.to_string(), + purl: format!("pkg:golang/{module}@{version}"), + resolved: None, + integrity: LockIntegrity::GoH1(hash.to_string()), + }); + } + Some(dedup_prefer_integrity(out)) +} + /// Keep a lock-recorded URL only when it is a plain http(s) artifact URL /// (drops `git+…`, `file:…`, `link:…` — content the registry conventions /// cannot reproduce; such entries stay listed for discovery but the fetch @@ -749,6 +895,112 @@ __metadata: assert_eq!(out[0].integrity, LockIntegrity::Sri("sha512-x==".into())); } + #[cfg(feature = "cargo")] + #[tokio::test] + async fn cargo_lock_inventories_crates_io_entries() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "Cargo.lock", + r#"# This file is automatically @generated by Cargo. +version = 4 + +[[package]] +name = "fixture" +version = "0.1.0" + +[[package]] +name = "serde" +version = "1.0.200" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f" + +[[package]] +name = "git-dep" +version = "0.5.0" +source = "git+https://github.com/x/git-dep?rev=abc#abc" + +[[package]] +name = "sparse-crate" +version = "2.0.0" +source = "sparse+https://index.crates.io/" +checksum = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" +"#, + ) + .await; + + let entries = inventory_cargo_lock(tmp.path()).await.unwrap(); + let serde_entry = entry(&entries, "serde"); + assert_eq!(serde_entry.version, "1.0.200"); + assert_eq!(serde_entry.purl, "pkg:cargo/serde@1.0.200"); + assert_eq!( + serde_entry.integrity, + LockIntegrity::Sha256Hex( + "ddc6f9cc94d67c0e21aaf7eda3a010fd3af78ebf6e096aa6e2e13c79749cce4f".into() + ) + ); + assert!(matches!( + entry(&entries, "sparse-crate").integrity, + 
LockIntegrity::Sha256Hex(_) + )); + // Workspace member (no source) excluded; git source unverifiable. + assert!(!entries.iter().any(|e| e.name == "fixture")); + assert_eq!(entry(&entries, "git-dep").integrity, LockIntegrity::None); + } + + #[cfg(feature = "golang")] + #[tokio::test] + async fn go_sum_inventories_module_zip_lines() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "go.sum", + "github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=\n\ + github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=\n\ + golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=\n", + ) + .await; + + let entries = inventory_go_sum(tmp.path()).await.unwrap(); + assert_eq!(entries.len(), 2, "the /go.mod line is skipped: {entries:?}"); + let gin = entry(&entries, "github.com/gin-gonic/gin"); + assert_eq!(gin.version, "v1.9.1"); + assert_eq!(gin.purl, "pkg:golang/github.com/gin-gonic/gin@v1.9.1"); + assert_eq!( + gin.integrity, + LockIntegrity::GoH1("h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=".into()) + ); + } + + #[tokio::test] + async fn lookup_matches_cargo_and_golang_purls() { + let entries = vec![ + LockfileEntry { + ecosystem: "cargo", + name: "serde".into(), + version: "1.0.200".into(), + purl: "pkg:cargo/serde@1.0.200".into(), + resolved: None, + integrity: LockIntegrity::None, + }, + LockfileEntry { + ecosystem: "golang", + name: "github.com/x/y".into(), + version: "v1.0.0".into(), + purl: "pkg:golang/github.com/x/y@v1.0.0".into(), + resolved: None, + integrity: LockIntegrity::None, + }, + ]; + assert!(lookup(&entries, "pkg:cargo/serde@1.0.200").is_some()); + assert!(lookup(&entries, "pkg:golang/github.com/x/y@v1.0.0").is_some()); + assert!(lookup(&entries, "pkg:cargo/serde@9.9.9").is_none()); + assert!( + lookup(&entries, "pkg:npm/serde@1.0.200").is_none(), + "ecosystem tags must match, not just name@version" + ); + } + #[tokio::test] async fn 
unsupported_flavors_yield_none() { // PnP marker wins over any lockfile. diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs index 1ab31b8..2acd74a 100644 --- a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -112,12 +112,250 @@ pub async fn fetch_and_stage( } match entry.ecosystem { "npm" => fetch_npm(entry, client).await, + #[cfg(feature = "cargo")] + "cargo" => fetch_cargo(entry, client).await, + #[cfg(feature = "golang")] + "golang" => fetch_golang(entry, client).await, other => Err(FetchError::Unverifiable(format!( "no registry fetcher for ecosystem `{other}`" ))), } } +/// crates.io static download host; override with `SOCKET_CRATES_REGISTRY`. +#[cfg(feature = "cargo")] +pub const DEFAULT_CRATES_REGISTRY: &str = "https://static.crates.io/crates"; + +#[cfg(feature = "cargo")] +fn crates_registry_base() -> String { + std::env::var("SOCKET_CRATES_REGISTRY") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_CRATES_REGISTRY.to_string()) +} + +/// `.crate` files are tar.gz with a `{name}-{version}/` top dir — the same +/// extraction path as npm tarballs. The Cargo.lock `checksum` is the sha256 +/// of the `.crate` bytes. 
+#[cfg(feature = "cargo")]
+async fn fetch_cargo(
+    entry: &LockfileEntry,
+    client: &reqwest::Client,
+) -> Result<FetchedPackage, FetchError> {
+    let url = entry.resolved.clone().unwrap_or_else(|| {
+        format!(
+            "{}/{}/{}-{}.crate",
+            crates_registry_base(),
+            entry.name,
+            entry.name,
+            entry.version
+        )
+    });
+    let bytes = download(client, &url).await.map_err(FetchError::Failed)?;
+    verify_integrity(&bytes, &entry.integrity)?; // verify BEFORE anything is extracted
+
+    let tmp = tempfile::tempdir()
+        .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?;
+    let dir = tmp.path().join("crate");
+    extract_tgz(&bytes, &dir).map_err(FetchError::Failed)?;
+    if tokio::fs::metadata(dir.join("Cargo.toml")).await.is_err() {
+        return Err(FetchError::Failed(format!(
+            "fetched .crate for {}@{} carries no Cargo.toml — not a crate",
+            entry.name, entry.version
+        )));
+    }
+    Ok(FetchedPackage {
+        dir,
+        url,
+        _tmp: tmp,
+    })
+}
+
+/// Default Go module proxy; `SOCKET_GOPROXY` wins, else the standard `GOPROXY`
+/// env (first `,`- or `|`-separated element that isn't `direct`/`off`).
+#[cfg(feature = "golang")]
+pub const DEFAULT_GOPROXY: &str = "https://proxy.golang.org";
+
+#[cfg(feature = "golang")]
+fn goproxy_base() -> String {
+    if let Ok(v) = std::env::var("SOCKET_GOPROXY") {
+        let v = v.trim_end_matches('/').to_string();
+        if !v.is_empty() {
+            return v;
+        }
+    }
+    if let Ok(v) = std::env::var("GOPROXY") {
+        for part in v.split([',', '|']) { // Go separates GOPROXY elements with `,` or `|`
+            let part = part.trim().trim_end_matches('/');
+            if !part.is_empty() && part != "direct" && part != "off" {
+                return part.to_string();
+            }
+        }
+    }
+    DEFAULT_GOPROXY.to_string()
+}
+
+/// Go's module-path case encoding for proxy URLs: an uppercase letter `X`
+/// becomes `!x` (applies to the module path and the version).
+#[cfg(feature = "golang")] +fn go_escape(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + if c.is_ascii_uppercase() { + out.push('!'); + out.push(c.to_ascii_lowercase()); + } else { + out.push(c); + } + } + out +} + +/// go.sum's `h1:` dirhash over a module zip: sha256 of the sorted +/// `"{sha256hex(content)} {entry name}\n"` lines, base64-encoded +/// (golang.org/x/mod/sumdb/dirhash Hash1/HashZip). Computed in memory +/// BEFORE extraction. +#[cfg(feature = "golang")] +fn go_h1_of_zip(bytes: &[u8]) -> Result { + use std::io::Read as _; + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable module zip: {e}"))?; + if archive.len() > MAX_ENTRIES { + return Err(format!("module zip exceeds {MAX_ENTRIES} entries")); + } + let mut files: Vec<(String, String)> = Vec::new(); + let mut total: u64 = 0; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable module zip entry: {e}"))?; + if file.is_dir() { + continue; // go module zips carry files only + } + let name = file.name().to_string(); + if name.contains('\n') { + return Err("module zip entry name contains a newline".to_string()); + } + if file.size() > MAX_ENTRY_BYTES { + return Err(format!( + "module zip entry `{name}` is {} bytes (cap {MAX_ENTRY_BYTES})", + file.size() + )); + } + total += file.size(); + if total > MAX_TOTAL_DECOMPRESSED_BYTES { + return Err(format!( + "module zip decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap" + )); + } + let mut hasher = Sha256::new(); + let mut buf = [0u8; 64 * 1024]; + loop { + let n = file + .read(&mut buf) + .map_err(|e| format!("cannot read module zip entry `{name}`: {e}"))?; + if n == 0 { + break; + } + hasher.update(&buf[..n]); + } + files.push((name, hex::encode(hasher.finalize()))); + } + files.sort_by(|a, b| a.0.cmp(&b.0)); + let mut h = Sha256::new(); + for (name, content_hex) in &files { + 
h.update(format!("{content_hex} {name}\n").as_bytes()); + } + Ok(format!( + "h1:{}", + base64::engine::general_purpose::STANDARD.encode(h.finalize()) + )) +} + +/// Traversal-guarded zip extraction with an EXPLICIT required prefix +/// (`@/` — go module paths contain slashes, so a +/// first-component strip would be wrong). Same guard family as +/// [`extract_tgz`]; an entry outside the prefix fails the whole artifact. +#[cfg(feature = "golang")] +fn extract_zip_with_prefix(bytes: &[u8], dest: &Path, prefix: &str) -> Result<(), String> { + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable module zip: {e}"))?; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable module zip entry: {e}"))?; + if file.is_dir() { + continue; + } + let name = file.name().to_string(); + let Some(rel) = name.strip_prefix(prefix) else { + return Err(format!( + "module zip entry `{name}` lies outside `{prefix}` — refusing the artifact" + )); + }; + if !is_safe_relative_subpath(rel) { + return Err(format!( + "module zip entry `{name}` escapes the extraction dir — refusing the artifact" + )); + } + let target = dest.join(rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut file, &mut out).map_err(|e| format!("cannot extract `{rel}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let exec = file.unix_mode().is_some_and(|m| m & 0o111 != 0); + let perms = if exec { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + Ok(()) +} + +#[cfg(feature = "golang")] +async fn fetch_golang( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let 
LockIntegrity::GoH1(expected) = &entry.integrity else { + return Err(FetchError::Unverifiable( + "go module entries verify via the go.sum h1 dirhash only".to_string(), + )); + }; + let url = entry.resolved.clone().unwrap_or_else(|| { + format!( + "{}/{}/@v/{}.zip", + goproxy_base(), + go_escape(&entry.name), + go_escape(&entry.version) + ) + }); + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + let actual = go_h1_of_zip(&bytes).map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "go.sum dirhash mismatch: lockfile records {expected}, the fetched module zip \ + hashes to {actual}" + ))); + } + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("module"); + let prefix = format!("{}@{}/", entry.name, entry.version); + extract_zip_with_prefix(&bytes, &dir, &prefix).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + async fn fetch_npm( entry: &LockfileEntry, client: &reqwest::Client, @@ -667,6 +905,152 @@ mod tests { } } + #[cfg(feature = "cargo")] + #[tokio::test] + async fn cargo_crate_fetch_verifies_sha256_and_extracts() { + // .crate = tar.gz with a {name}-{version}/ top dir. 
+ let crate_bytes = make_tgz(&[ + ("left-pad-1.3.0/Cargo.toml", b"[package]\nname = \"left-pad\"\n", false), + ("left-pad-1.3.0/src/lib.rs", b"pub fn pad() {}\n", false), + ]); + let sha = hex::encode(Sha256::digest(&crate_bytes)); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/left-pad/left-pad-1.3.0.crate")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(crate_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "cargo", + name: "left-pad".into(), + version: "1.3.0".into(), + purl: "pkg:cargo/left-pad@1.3.0".into(), + resolved: Some(format!("{}/left-pad/left-pad-1.3.0.crate", mock.uri())), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("Cargo.toml").is_file()); + assert!(fetched.dir().join("src/lib.rs").is_file()); + + // Tampered checksum fails closed. + let entry = LockfileEntry { + integrity: LockIntegrity::Sha256Hex("0".repeat(64)), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + /// Build a go module zip in memory (files only, `module@version/` + /// prefix — the go zip layout). + #[cfg(feature = "golang")] + fn make_module_zip(prefix: &str, files: &[(&str, &[u8])]) -> Vec { + use std::io::Write as _; + let mut writer = zip::ZipWriter::new(std::io::Cursor::new(Vec::new())); + for (name, bytes) in files { + writer + .start_file( + format!("{prefix}{name}"), + zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated), + ) + .unwrap(); + writer.write_all(bytes).unwrap(); + } + writer.finish().unwrap().into_inner() + } + + /// Independent spec-mirror of dirhash Hash1/HashZip, structured + /// differently from the production fn to catch encoding slips. 
+ #[cfg(feature = "golang")] + fn spec_h1(files: &[(&str, &[u8])], prefix: &str) -> String { + // dirhash.Hash1 sorts the FILE NAMES, then emits one line per file. + let mut named: Vec<(String, &[u8])> = files + .iter() + .map(|(name, bytes)| (format!("{prefix}{name}"), *bytes)) + .collect(); + named.sort_by(|a, b| a.0.cmp(&b.0)); + let lines: Vec = named + .iter() + .map(|(name, bytes)| format!("{} {name}\n", hex::encode(Sha256::digest(bytes)))) + .collect(); + let digest = Sha256::digest(lines.concat().as_bytes()); + format!( + "h1:{}", + base64::engine::general_purpose::STANDARD.encode(digest) + ) + } + + #[cfg(feature = "golang")] + #[tokio::test] + async fn golang_module_fetch_verifies_h1_dirhash_and_extracts() { + // Out-of-order files prove the sort; nested module path proves the + // explicit-prefix strip (a first-component strip would be wrong). + let prefix = "github.com/x/y@v1.0.0/"; + let files: [(&str, &[u8]); 3] = [ + ("go.mod", b"module github.com/x/y\n"), + ("a/b.go", b"package a\n"), + ("README.md", b"# y\n"), + ]; + let zip_bytes = make_module_zip(prefix, &files); + let expected = spec_h1(&files, prefix); + assert_eq!( + go_h1_of_zip(&zip_bytes).unwrap(), + expected, + "production dirhash matches the spec mirror" + ); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/github.com/x/y/@v/v1.0.0.zip")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(zip_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "golang", + name: "github.com/x/y".into(), + version: "v1.0.0".into(), + purl: "pkg:golang/github.com/x/y@v1.0.0".into(), + resolved: Some(format!("{}/github.com/x/y/@v/v1.0.0.zip", mock.uri())), + integrity: LockIntegrity::GoH1(expected), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!(fetched.dir().join("go.mod").is_file()); + assert!(fetched.dir().join("a/b.go").is_file()); + + // Tampered h1 fails closed. 
+ let entry = LockfileEntry { + integrity: LockIntegrity::GoH1("h1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=".into()), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Failed(msg)) => assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + #[cfg(feature = "golang")] + #[test] + fn go_escape_uppercase_and_zip_prefix_guards() { + assert_eq!(go_escape("github.com/Azure/azure-sdk"), "github.com/!azure/azure-sdk"); + assert_eq!(go_escape("v1.0.0-RC1"), "v1.0.0-!r!c1"); + + // An entry outside the module prefix fails the whole artifact. + let zip_bytes = make_module_zip("github.com/x/y@v1.0.0/", &[("go.mod", b"m\n")]); + let tmp = tempfile::tempdir().unwrap(); + let err = extract_zip_with_prefix(&zip_bytes, tmp.path(), "github.com/OTHER@v1/") + .unwrap_err(); + assert!(err.contains("outside"), "{err}"); + } + #[test] fn oversized_entry_header_fails_closed() { // A header CLAIMING more than the per-entry cap fails before any From b5e9f6366d050e559211aa9aca01a839c67fb7fc Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 18:43:10 -0400 Subject: [PATCH 11/19] feat(vendor): composer + gem + pypi lockfile inventory and verified fetch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - composer.lock packages[]/packages-dev[]: zip dists with their sha1 shasum (frequently empty → discovery-only); names lowercased to the packagist form, pretty leading v dropped; path dists (ours) excluded. Fetch verifies sha1 and strips the variable zipball top dir. - Gemfile.lock GEM/specs + bundler 2.6 CHECKSUMS sha256 (older locks discovery-only); the GEM remote drives the /downloads/ URL. Platform-suffixed specs skipped (unsupported for vendoring). The fetched .gem (plain tar) is sha256-verified whole, then data.tar.gz extracts at the root (no prefix strip). 
- pypi: uv.lock registry packages with a pure py3-none-any wheel carry a fetchable URL + sha256; poetry.lock and ==-pinned requirements.txt contribute discovery-only entries (PEP 503-normalized names). The unzipped wheel is a site-packages-shaped stage for the pypi backend. Co-Authored-By: Claude Fable 5 --- .../src/patch/vendor/lock_inventory.rs | 561 +++++++++++++++++- .../src/patch/vendor/registry_fetch.rs | 359 ++++++++++- 2 files changed, 915 insertions(+), 5 deletions(-) diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs index c391681..476353c 100644 --- a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs +++ b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -130,6 +130,12 @@ pub fn lookup<'a>(entries: &'a [LockfileEntry], purl: &str) -> Option<&'a Lockfi }; let at = rest.rfind('@').filter(|&i| i > 0)?; let (name, version) = (&rest[..at], &rest[at + 1..]); + // pypi names compare in PEP 503 normalized form. 
+ let name = if eco == "pypi" { + pep503(name) + } else { + name.to_string() + }; entries .iter() .find(|e| e.ecosystem == eco && e.name == name && e.version == version) @@ -150,6 +156,16 @@ pub async fn inventory_project(project_root: &Path) -> Vec { if let Some(entries) = inventory_go_sum(project_root).await { out.extend(entries); } + #[cfg(feature = "composer")] + if let Some(entries) = inventory_composer_lock(project_root).await { + out.extend(entries); + } + if let Some(entries) = inventory_gemfile_lock(project_root).await { + out.extend(entries); + } + if let Some(entries) = inventory_pypi_locks(project_root).await { + out.extend(entries); + } out } @@ -201,10 +217,11 @@ pub async fn inventory_cargo_lock(project_root: &Path) -> Option, Option, Option, Option); let mut out = Vec::new(); - let mut cur: Option<(Option, Option, Option, Option)> = None; - let flush = |cur: &mut Option<(Option, Option, Option, Option)>, - out: &mut Vec| { + let mut cur: Option = None; + let flush = |cur: &mut Option, out: &mut Vec| { if let Some((Some(name), Some(version), source, checksum)) = cur.take() { let Some(source) = source else { return; // workspace member @@ -552,6 +569,390 @@ async fn inventory_bun(root: &Path) -> Option> { Some(out) } +// ────────────────────────────── composer.lock ────────────────────────────── + +/// Inventory `composer.lock` `packages`/`packages-dev`. The `dist.shasum` +/// (sha1 of the dist zip) is frequently empty — such entries stay +/// discovery-only. Names lowercase to the canonical packagist form; +/// versions drop the pretty leading `v`. 
+#[cfg(feature = "composer")] +pub async fn inventory_composer_lock(project_root: &Path) -> Option> { + let bytes = tokio::fs::read(project_root.join("composer.lock")).await.ok()?; + let doc: Value = serde_json::from_slice(&bytes).ok()?; + let mut out = Vec::new(); + for section in ["packages", "packages-dev"] { + let Some(list) = doc.get(section).and_then(Value::as_array) else { + continue; + }; + for pkg in list { + let Some(name) = pkg.get("name").and_then(Value::as_str) else { + continue; + }; + let Some(version) = pkg.get("version").and_then(Value::as_str) else { + continue; + }; + let name = name.to_ascii_lowercase(); + let version = version + .strip_prefix('v') + .filter(|r| r.chars().next().is_some_and(|c| c.is_ascii_digit())) + .unwrap_or(version) + .to_string(); + if !path_safety::is_safe_multi_segment(&name) + || name.split('/').count() != 2 + || !path_safety::is_safe_single_segment(&version) + { + continue; + } + let dist = pkg.get("dist"); + let dist_url = dist + .and_then(|d| d.get("url")) + .and_then(Value::as_str) + .unwrap_or(""); + // Our own vendored entries use a path dist — skip. 
+ if dist + .and_then(|d| d.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "path") + || parse_vendor_path(dist_url).is_some() + { + continue; + } + let is_zip = dist + .and_then(|d| d.get("type")) + .and_then(Value::as_str) + .is_some_and(|t| t == "zip"); + let shasum = dist + .and_then(|d| d.get("shasum")) + .and_then(Value::as_str) + .unwrap_or(""); + let integrity = if is_zip + && shasum.len() == 40 + && shasum.bytes().all(|b| b.is_ascii_hexdigit()) + { + LockIntegrity::Sha1Hex(shasum.to_ascii_lowercase()) + } else { + LockIntegrity::None + }; + let purl = format!("pkg:composer/{name}@{version}"); + out.push(LockfileEntry { + ecosystem: "composer", + name, + version, + purl, + resolved: is_zip.then(|| http_url(dist_url)).flatten(), + integrity, + }); + } + } + Some(dedup_prefer_integrity(out)) +} + +// ────────────────────────────── Gemfile.lock ────────────────────────────── + +/// Inventory `Gemfile.lock`: `GEM`-section `specs:` entries (4-space +/// indent; deeper lines are dependency ranges) plus the bundler ≥ 2.6 +/// `CHECKSUMS` section's sha256 values when present (older locks stay +/// discovery-only). Platform-suffixed specs (`nokogiri (1.16.5-arm64-…)`) +/// are skipped — platform gems are unsupported for vendoring anyway. 
+pub async fn inventory_gemfile_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("Gemfile.lock")) + .await + .ok()?; + let mut remote: Option = None; + let mut checksums: HashMap<(String, String), String> = HashMap::new(); + let mut specs: Vec<(String, String)> = Vec::new(); + + let mut section = ""; + let mut in_specs = false; + for line in text.lines() { + if !line.starts_with(' ') { + section = line.trim(); + in_specs = false; + continue; + } + let trimmed = line.trim_start(); + let indent = line.len() - trimmed.len(); + match section { + "GEM" => { + if indent == 2 { + if let Some(r) = trimmed.strip_prefix("remote:") { + let r = r.trim().trim_end_matches('/'); + if remote.is_none() && !r.is_empty() { + remote = Some(r.to_string()); + } + } + in_specs = trimmed == "specs:"; + } else if in_specs && indent == 4 { + if let Some((name, version)) = parse_gem_spec_line(trimmed) { + specs.push((name, version)); + } + } + } + "CHECKSUMS" => { + // ` name (version) sha256=hex` + if let Some((spec_part, hash_part)) = + trimmed.rsplit_once(" sha256=").map(|(s, h)| (s, h.trim())) + { + if let Some((name, version)) = parse_gem_spec_line(spec_part) { + if hash_part.len() == 64 + && hash_part.bytes().all(|b| b.is_ascii_hexdigit()) + { + checksums + .insert((name, version), hash_part.to_ascii_lowercase()); + } + } + } + } + _ => {} + } + } + if specs.is_empty() { + return None; + } + let base = remote.unwrap_or_else(|| "https://rubygems.org".to_string()); + let mut out = Vec::new(); + for (name, version) in specs { + if !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + { + continue; + } + let integrity = checksums + .get(&(name.clone(), version.clone())) + .map(|h| LockIntegrity::Sha256Hex(h.clone())) + .unwrap_or(LockIntegrity::None); + out.push(LockfileEntry { + ecosystem: "gem", + purl: format!("pkg:gem/{name}@{version}"), + resolved: 
http_url(&format!("{base}/downloads/{name}-{version}.gem")), + name, + version, + integrity, + }); + } + Some(dedup_prefer_integrity(out)) +} + +/// `name (version)` → parts; platform-suffixed versions (`1.2.3-x86_64…`) +/// and dependency lines (no parens / range operators) yield `None`. +fn parse_gem_spec_line(line: &str) -> Option<(String, String)> { + let (name, rest) = line.split_once(" (")?; + let version = rest.strip_suffix(')')?; + if name.is_empty() + || version.is_empty() + || version.contains(' ') + || version.contains('-') + || !version.chars().next().is_some_and(|c| c.is_ascii_digit()) + { + return None; + } + Some((name.to_string(), version.to_string())) +} + +// ─────────────────────────────── pypi locks ─────────────────────────────── + +/// PEP 503 name normalization (`Foo._Bar` → `foo-bar`) — pypi purls and +/// lock entries must compare in this form. +fn pep503(name: &str) -> String { + let mut out = String::with_capacity(name.len()); + let mut last_dash = false; + for c in name.chars() { + let c = c.to_ascii_lowercase(); + if c == '-' || c == '_' || c == '.' { + if !last_dash { + out.push('-'); + last_dash = true; + } + } else { + out.push(c); + last_dash = false; + } + } + out +} + +/// Inventory the pypi lock the project carries. Fetchable resolution +/// (URL + sha256 of a pure `py3-none-any` wheel) comes from `uv.lock`; +/// `poetry.lock` and `--hash`-pinned `requirements.txt` contribute +/// DISCOVERY-only entries (no recorded URL; platform-independent wheel +/// choice is not derivable offline). Pipenv/pdm locks: not yet read. +pub async fn inventory_pypi_locks(project_root: &Path) -> Option> { + if let Some(out) = inventory_uv_lock(project_root).await { + return Some(out); + } + if let Some(out) = inventory_poetry_lock(project_root).await { + return Some(out); + } + inventory_requirements_txt(project_root).await +} + +/// uv.lock: TOML `[[package]]` blocks with `name`/`version` and +/// `wheels = [{ url, hash = "sha256:…" }, …]` entries. 
+async fn inventory_uv_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("uv.lock")) + .await + .ok()?; + let mut out = Vec::new(); + // Line-oriented: uv emits `[[package]]` blocks; wheels live either as + // inline `{ url = "…", hash = "sha256:…" }` table rows or one-line + // arrays. A pure-python wheel ends `py3-none-any.whl`. + let mut name: Option = None; + let mut version: Option = None; + let mut sourced_registry = true; + let mut wheel: Option<(String, String)> = None; + let flush = |name: &mut Option, + version: &mut Option, + sourced_registry: &mut bool, + wheel: &mut Option<(String, String)>, + out: &mut Vec| { + if let (Some(n), Some(v)) = (name.take(), version.take()) { + let canonical = pep503(&n); + if *sourced_registry + && path_safety::is_safe_single_segment(&canonical) + && path_safety::is_safe_single_segment(&v) + { + let (resolved, integrity) = match wheel.take() { + Some((url, sha)) => (http_url(&url), LockIntegrity::Sha256Hex(sha)), + None => (None, LockIntegrity::None), + }; + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{canonical}@{v}"), + name: canonical, + version: v, + resolved, + integrity, + }); + } + } + *sourced_registry = true; + *wheel = None; + }; + for line in text.lines() { + let t = line.trim(); + if t == "[[package]]" { + flush(&mut name, &mut version, &mut sourced_registry, &mut wheel, &mut out); + continue; + } + if let Some(v) = t.strip_prefix("name = ") { + name = Some(v.trim_matches('"').to_string()); + } else if let Some(v) = t.strip_prefix("version = ") { + version = Some(v.trim_matches('"').to_string()); + } else if t.starts_with("source = ") { + // Registry packages: `source = { registry = "…" }`; editable/ + // virtual/path/git sources are not fetchable artifacts. 
+ sourced_registry = t.contains("registry"); + } else if wheel.is_none() && t.contains("py3-none-any.whl") { + // `{ url = "…py3-none-any.whl", hash = "sha256:…" }` + let url = t + .split("url = \"") + .nth(1) + .and_then(|r| r.split('"').next()) + .unwrap_or(""); + let sha = t + .split("hash = \"sha256:") + .nth(1) + .and_then(|r| r.split('"').next()) + .unwrap_or(""); + if !url.is_empty() && sha.len() == 64 && sha.bytes().all(|b| b.is_ascii_hexdigit()) { + wheel = Some((url.to_string(), sha.to_ascii_lowercase())); + } + } + } + flush(&mut name, &mut version, &mut sourced_registry, &mut wheel, &mut out); + Some(dedup_prefer_integrity(out)) +} + +/// poetry.lock: `[[package]]` blocks with `name`/`version` — discovery +/// only (file hashes exist but carry no URLs and no platform choice). +async fn inventory_poetry_lock(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("poetry.lock")) + .await + .ok()?; + let mut out = Vec::new(); + let mut in_package = false; + let mut name: Option = None; + for line in text.lines() { + let t = line.trim(); + if t == "[[package]]" { + in_package = true; + name = None; + continue; + } + if t.starts_with('[') && t != "[[package]]" { + in_package = false; + continue; + } + if !in_package { + continue; + } + if let Some(v) = t.strip_prefix("name = ") { + name = Some(pep503(v.trim_matches('"'))); + } else if let Some(v) = t.strip_prefix("version = ") { + if let Some(n) = name.take() { + let v = v.trim_matches('"').to_string(); + if path_safety::is_safe_single_segment(&n) + && path_safety::is_safe_single_segment(&v) + { + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{n}@{v}"), + name: n, + version: v, + resolved: None, + integrity: LockIntegrity::None, + }); + } + } + } + } + if out.is_empty() { + return None; + } + Some(dedup_prefer_integrity(out)) +} + +/// requirements.txt with exact `==` pins — discovery only. 
+async fn inventory_requirements_txt(project_root: &Path) -> Option> { + let text = tokio::fs::read_to_string(project_root.join("requirements.txt")) + .await + .ok()?; + let mut out = Vec::new(); + for line in text.lines() { + let t = line.trim(); + if t.is_empty() || t.starts_with('#') || t.starts_with('-') { + continue; + } + // `name==version` (strip extras, env markers, hash continuations). + let spec = t.split(';').next().unwrap_or(t).trim(); + let spec = spec.split_whitespace().next().unwrap_or(spec); + let Some((raw_name, version)) = spec.split_once("==") else { + continue; + }; + let name = pep503(raw_name.split('[').next().unwrap_or(raw_name).trim()); + let version = version.trim().to_string(); + if name.is_empty() + || !path_safety::is_safe_single_segment(&name) + || !path_safety::is_safe_single_segment(&version) + || !version.chars().next().is_some_and(|c| c.is_ascii_digit()) + { + continue; + } + out.push(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{name}@{version}"), + name, + version, + resolved: None, + integrity: LockIntegrity::None, + }); + } + if out.is_empty() { + return None; + } + Some(dedup_prefer_integrity(out)) +} + #[cfg(test)] mod tests { use super::*; @@ -1001,6 +1402,160 @@ checksum = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" ); } + #[cfg(feature = "composer")] + #[tokio::test] + async fn composer_lock_inventories_dist_entries() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "composer.lock", + r#"{ + "packages": [ + { + "name": "Monolog/Monolog", + "version": "v3.5.0", + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Seldaek/monolog/zipball/abc", + "shasum": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + } + }, + { + "name": "vendored/pkg", + "version": "1.0.0", + "dist": { "type": "path", "url": ".socket/vendor/composer/9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f/vendored/pkg@1.0.0" } + } + ], + "packages-dev": [ + { + "name": "symfony/console", + "version": 
"v6.4.1", + "dist": { "type": "zip", "url": "https://example.com/console.zip", "shasum": "" } + } + ] +}"#, + ) + .await; + + let entries = inventory_composer_lock(tmp.path()).await.unwrap(); + let monolog = entry(&entries, "monolog/monolog"); + assert_eq!(monolog.version, "3.5.0", "leading v dropped, name lowercased"); + assert_eq!(monolog.purl, "pkg:composer/monolog/monolog@3.5.0"); + assert!(matches!(monolog.integrity, LockIntegrity::Sha1Hex(_))); + assert!(monolog.resolved.as_deref().unwrap().contains("zipball")); + // Empty shasum → discovery-only; path dist (ours) excluded. + assert_eq!( + entry(&entries, "symfony/console").integrity, + LockIntegrity::None + ); + assert!(!entries.iter().any(|e| e.name == "vendored/pkg")); + } + + #[tokio::test] + async fn gemfile_lock_inventories_specs_and_checksums() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "Gemfile.lock", + "GEM\n remote: https://rubygems.org/\n specs:\n rails (7.1.0)\n \ + actionpack (= 7.1.0)\n rack (3.0.8)\n nokogiri (1.16.5-arm64-darwin)\n\n\ + PLATFORMS\n ruby\n\nDEPENDENCIES\n rails\n\nCHECKSUMS\n \ + rails (7.1.0) sha256=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n\n\ + BUNDLED WITH\n 2.6.0\n", + ) + .await; + + let entries = inventory_gemfile_lock(tmp.path()).await.unwrap(); + let rails = entry(&entries, "rails"); + assert_eq!(rails.version, "7.1.0"); + assert_eq!(rails.purl, "pkg:gem/rails@7.1.0"); + assert!(matches!(rails.integrity, LockIntegrity::Sha256Hex(_))); + assert_eq!( + rails.resolved.as_deref(), + Some("https://rubygems.org/downloads/rails-7.1.0.gem") + ); + // No CHECKSUMS entry → discovery-only; platform gem skipped; + // dependency range lines never parse as specs. 
+ assert_eq!(entry(&entries, "rack").integrity, LockIntegrity::None); + assert!(!entries.iter().any(|e| e.name == "nokogiri")); + assert!(!entries.iter().any(|e| e.name == "actionpack")); + } + + #[tokio::test] + async fn uv_lock_inventories_pure_wheels() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "uv.lock", + r#"version = 1 + +[[package]] +name = "Requests" +version = "2.28.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/requests-2.28.0-py3-none-any.whl", hash = "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" }, +] + +[[package]] +name = "native-only" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/native_only-1.0.0-cp312-macosx.whl", hash = "sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" }, +] + +[[package]] +name = "local-proj" +version = "0.0.1" +source = { editable = "." } +"#, + ) + .await; + + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + let requests = entry(&entries, "requests"); + assert_eq!(requests.purl, "pkg:pypi/requests@2.28.0", "PEP 503 name"); + assert!(matches!(requests.integrity, LockIntegrity::Sha256Hex(_))); + assert!(requests + .resolved + .as_deref() + .unwrap() + .ends_with("py3-none-any.whl")); + // Platform-only wheels → discovery-only; editable sources excluded. 
+ assert_eq!( + entry(&entries, "native-only").integrity, + LockIntegrity::None + ); + assert!(!entries.iter().any(|e| e.name == "local-proj")); + } + + #[tokio::test] + async fn poetry_and_requirements_are_discovery_only() { + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "poetry.lock", + "[[package]]\nname = \"Flask_Login\"\nversion = \"0.6.3\"\n\n[metadata]\nlock-version = \"2.0\"\n", + ) + .await; + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + let fl = entry(&entries, "flask-login"); + assert_eq!(fl.purl, "pkg:pypi/flask-login@0.6.3"); + assert_eq!(fl.integrity, LockIntegrity::None); + + let tmp = tempfile::tempdir().unwrap(); + write( + tmp.path(), + "requirements.txt", + "# pinned\nrequests[security]==2.28.0 --hash=sha256:abc \\\n --hash=sha256:def\nflask>=2.0\n-e .\n", + ) + .await; + let entries = inventory_pypi_locks(tmp.path()).await.unwrap(); + assert_eq!(entries.len(), 1, "{entries:?}"); + assert_eq!(entries[0].purl, "pkg:pypi/requests@2.28.0"); + } + #[tokio::test] async fn unsupported_flavors_yield_none() { // PnP marker wins over any lockfile. diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs index 2acd74a..3cef46b 100644 --- a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -116,12 +116,198 @@ pub async fn fetch_and_stage( "cargo" => fetch_cargo(entry, client).await, #[cfg(feature = "golang")] "golang" => fetch_golang(entry, client).await, + #[cfg(feature = "composer")] + "composer" => fetch_composer(entry, client).await, + "gem" => fetch_gem(entry, client).await, + "pypi" => fetch_pypi(entry, client).await, other => Err(FetchError::Unverifiable(format!( "no registry fetcher for ecosystem `{other}`" ))), } } +/// Traversal-guarded zip extraction. 
`strip_first` mirrors the tar +/// behavior (composer dist zips carry a variable top dir; wheels carry +/// content at the root). +fn extract_zip(bytes: &[u8], dest: &Path, strip_first: bool) -> Result<(), String> { + let mut archive = zip::ZipArchive::new(std::io::Cursor::new(bytes)) + .map_err(|e| format!("unreadable zip: {e}"))?; + if archive.len() > MAX_ENTRIES { + return Err(format!("zip exceeds {MAX_ENTRIES} entries")); + } + let mut total: u64 = 0; + for i in 0..archive.len() { + let mut file = archive + .by_index(i) + .map_err(|e| format!("unreadable zip entry: {e}"))?; + if file.is_dir() { + continue; + } + let raw = PathBuf::from(file.name()); + let rel = if strip_first { + match strip_first_component(&raw) { + Some(rel) => rel, + None => continue, + } + } else { + raw.clone() + }; + let rel_str = rel.to_string_lossy().into_owned(); + if !is_safe_relative_subpath(&rel_str) { + return Err(format!( + "zip entry `{}` escapes the extraction dir — refusing the artifact", + raw.display() + )); + } + if file.size() > MAX_ENTRY_BYTES { + return Err(format!( + "zip entry `{rel_str}` is {} bytes (cap {MAX_ENTRY_BYTES})", + file.size() + )); + } + total += file.size(); + if total > MAX_TOTAL_DECOMPRESSED_BYTES { + return Err(format!( + "zip decompresses past the {MAX_TOTAL_DECOMPRESSED_BYTES}-byte cap" + )); + } + let target = dest.join(&rel); + if let Some(parent) = target.parent() { + std::fs::create_dir_all(parent) + .map_err(|e| format!("cannot create {}: {e}", parent.display()))?; + } + let mut out = std::fs::File::create(&target) + .map_err(|e| format!("cannot create {}: {e}", target.display()))?; + std::io::copy(&mut file, &mut out) + .map_err(|e| format!("cannot extract `{rel_str}`: {e}"))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let exec = file.unix_mode().is_some_and(|m| m & 0o111 != 0); + let perms = if exec { 0o755 } else { 0o644 }; + let _ = std::fs::set_permissions(&target, std::fs::Permissions::from_mode(perms)); + } + } + 
Ok(()) +} + +/// Composer dist zips (packagist/GitHub zipballs): sha1-verified, variable +/// top dir stripped. The extracted dir plays the installed package dir. +#[cfg(feature = "composer")] +async fn fetch_composer( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "composer.lock records no dist URL for {}@{}", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("package"); + extract_zip(&bytes, &dir, /*strip_first=*/ true).map_err(FetchError::Failed)?; + if tokio::fs::metadata(dir.join("composer.json")).await.is_err() { + return Err(FetchError::Failed(format!( + "fetched dist for {}@{} carries no composer.json", + entry.name, entry.version + ))); + } + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// `.gem` files are plain tar containers holding `data.tar.gz` (the +/// package content, no prefix dir) + metadata. The whole `.gem` is +/// sha256-verified against the Gemfile.lock CHECKSUMS entry first. +async fn fetch_gem( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "no download URL for {}@{}", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + + // Locate data.tar.gz inside the (uncompressed) outer tar. + let mut archive = tar::Archive::new(bytes.as_slice()); + let mut data: Option> = None; + for e in archive + .entries() + .map_err(|e| FetchError::Failed(format!("unreadable .gem: {e}")))? 
+ { + use std::io::Read as _; + let mut e = e.map_err(|err| FetchError::Failed(format!("unreadable .gem entry: {err}")))?; + let is_data = e + .path() + .ok() + .is_some_and(|p| p.as_os_str() == "data.tar.gz"); + if !is_data { + continue; + } + if e.header().size().unwrap_or(u64::MAX) > MAX_DOWNLOAD_BYTES { + return Err(FetchError::Failed("data.tar.gz exceeds the size cap".into())); + } + let mut buf = Vec::new(); + e.read_to_end(&mut buf) + .map_err(|err| FetchError::Failed(format!("cannot read data.tar.gz: {err}")))?; + data = Some(buf); + break; + } + let Some(data) = data else { + return Err(FetchError::Failed(format!( + "fetched .gem for {}@{} carries no data.tar.gz", + entry.name, entry.version + ))); + }; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("gem"); + extract_tgz_no_strip(&data, &dir).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + +/// Pure-python wheels recorded by uv.lock (URL + sha256): the unzipped +/// wheel IS a site-packages layout (package dirs + `.dist-info/RECORD` at +/// the root), which is exactly the shape the pypi vendor backend stages +/// from. 
+async fn fetch_pypi( + entry: &LockfileEntry, + client: &reqwest::Client, +) -> Result { + let Some(url) = entry.resolved.clone() else { + return Err(FetchError::Unverifiable(format!( + "the lockfile records no platform-independent wheel URL for {}@{} (only uv.lock carries fetchable wheel resolutions today)", + entry.name, entry.version + ))); + }; + let bytes = download(client, &url).await.map_err(FetchError::Failed)?; + verify_integrity(&bytes, &entry.integrity)?; + let tmp = tempfile::tempdir() + .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; + let dir = tmp.path().join("site-packages"); + extract_zip(&bytes, &dir, /*strip_first=*/ false).map_err(FetchError::Failed)?; + Ok(FetchedPackage { + dir, + url, + _tmp: tmp, + }) +} + /// crates.io static download host; override with `SOCKET_CRATES_REGISTRY`. #[cfg(feature = "cargo")] pub const DEFAULT_CRATES_REGISTRY: &str = "https://static.crates.io/crates"; @@ -576,6 +762,17 @@ fn strip_first_component(path: &Path) -> Option { /// Fails CLOSED on any traversal-shaped entry — a malicious tarball must /// not half-extract. fn extract_tgz(bytes: &[u8], dest: &Path) -> Result<(), String> { + extract_tar_gz(bytes, dest, /*strip_first=*/ true) +} + +/// Like [`extract_tgz`] but keeps entry paths verbatim (gem `data.tar.gz` +/// archives carry package content at the root, no prefix dir). +#[allow(dead_code)] // used by the gem fetcher (feature-independent helper) +fn extract_tgz_no_strip(bytes: &[u8], dest: &Path) -> Result<(), String> { + extract_tar_gz(bytes, dest, /*strip_first=*/ false) +} + +fn extract_tar_gz(bytes: &[u8], dest: &Path, strip_first: bool) -> Result<(), String> { use std::io::Read as _; let gz = flate2::read::GzDecoder::new(bytes).take(MAX_TOTAL_DECOMPRESSED_BYTES); let mut archive = tar::Archive::new(gz); @@ -598,8 +795,13 @@ fn extract_tgz(bytes: &[u8], dest: &Path) -> Result<(), String> { .path() .map_err(|e| format!("tarball entry has an undecodable path: {e}"))? 
.into_owned(); - let Some(rel) = strip_first_component(&raw) else { - continue; // a bare prefix-level file — not package content + let rel = if strip_first { + match strip_first_component(&raw) { + Some(rel) => rel, + None => continue, // a bare prefix-level file — not package content + } + } else { + raw.clone() }; let rel_str = rel.to_string_lossy(); if !is_safe_relative_subpath(&rel_str) { @@ -1051,6 +1253,159 @@ mod tests { assert!(err.contains("outside"), "{err}"); } + /// Build a zip with the given `(path, bytes)` entries. + fn make_zip(files: &[(&str, &[u8])]) -> Vec { + use std::io::Write as _; + let mut writer = zip::ZipWriter::new(std::io::Cursor::new(Vec::new())); + for (name, bytes) in files { + writer + .start_file( + name.to_string(), + zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated), + ) + .unwrap(); + writer.write_all(bytes).unwrap(); + } + writer.finish().unwrap().into_inner() + } + + #[cfg(feature = "composer")] + #[tokio::test] + async fn composer_dist_fetch_verifies_sha1_and_strips_top_dir() { + // GitHub zipballs carry an `owner-repo-sha/` top dir. + let zip_bytes = make_zip(&[ + ("Seldaek-monolog-abc123/composer.json", br#"{"name":"monolog/monolog"}"#), + ("Seldaek-monolog-abc123/src/Logger.php", b" assert!(msg.contains("mismatch"), "{msg}"), + other => panic!("expected mismatch, got {other:?}"), + } + } + + #[tokio::test] + async fn gem_fetch_verifies_sha256_and_extracts_data_tar() { + // .gem = plain tar holding data.tar.gz (content at the ROOT — no + // prefix dir) + metadata.gz. 
+ let data_tgz = make_tgz(&[ + ("lib/rails.rb", b"module Rails; end\n", false), + ("README.md", b"# rails\n", false), + ]); + let mut outer = tar::Builder::new(Vec::new()); + for (name, bytes) in [("metadata.gz", b"meta".as_slice()), ("data.tar.gz", &data_tgz)] { + let mut header = tar::Header::new_gnu(); + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + outer.append_data(&mut header, name, bytes).unwrap(); + } + let gem_bytes = outer.into_inner().unwrap(); + let sha = hex::encode(Sha256::digest(&gem_bytes)); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/downloads/rails-7.1.0.gem")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(gem_bytes)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "gem", + name: "rails".into(), + version: "7.1.0".into(), + purl: "pkg:gem/rails@7.1.0".into(), + resolved: Some(format!("{}/downloads/rails-7.1.0.gem", mock.uri())), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + assert!( + fetched.dir().join("lib/rails.rb").is_file(), + "data.tar.gz content extracts at the root (no strip)" + ); + assert!(fetched.dir().join("README.md").is_file()); + } + + #[tokio::test] + async fn pypi_wheel_fetch_extracts_site_packages_layout() { + let wheel = make_zip(&[ + ("requests/__init__.py", b"__version__ = '2.28.0'\n"), + ( + "requests-2.28.0.dist-info/RECORD", + b"requests/__init__.py,sha256=abc,24\n", + ), + ("requests-2.28.0.dist-info/WHEEL", b"Wheel-Version: 1.0\n"), + ]); + let sha = hex::encode(Sha256::digest(&wheel)); + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(url_path("/packages/requests-2.28.0-py3-none-any.whl")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(wheel)) + .mount(&mock) + .await; + + let entry = LockfileEntry { + ecosystem: "pypi", + name: "requests".into(), + version: "2.28.0".into(), 
+ purl: "pkg:pypi/requests@2.28.0".into(), + resolved: Some(format!( + "{}/packages/requests-2.28.0-py3-none-any.whl", + mock.uri() + )), + integrity: LockIntegrity::Sha256Hex(sha), + }; + let fetched = fetch_and_stage(&entry, &build_registry_client()) + .await + .unwrap(); + // Wheel content at the root: a site-packages-shaped dir with the + // dist-info RECORD the pypi vendor backend stages from. + assert!(fetched.dir().join("requests/__init__.py").is_file()); + assert!(fetched + .dir() + .join("requests-2.28.0.dist-info/RECORD") + .is_file()); + + // No recorded wheel URL (poetry/requirements) → Unverifiable. + let entry = LockfileEntry { + resolved: None, + integrity: LockIntegrity::Sha256Hex("0".repeat(64)), + ..entry + }; + match fetch_and_stage(&entry, &build_registry_client()).await { + Err(FetchError::Unverifiable(msg)) => assert!(msg.contains("wheel"), "{msg}"), + other => panic!("expected Unverifiable, got {other:?}"), + } + } + #[test] fn oversized_entry_header_fails_closed() { // A header CLAIMING more than the per-entry cap fails before any From 772f98d6fb5ab71bc756e93993f9ce9371aa31b8 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 19:03:25 -0400 Subject: [PATCH 12/19] feat(scan): all-ecosystem lockfile supplement + docs scan's lockfile supplement now consumes inventory_project (npm-family, Cargo.lock, go.sum, composer.lock, Gemfile.lock, uv/poetry/requirements) with per-ecosystem counts; the vendor auto-fetch pass likewise serves every inventoried ecosystem. CLI_CONTRACT.md gains the lockfile-supplement and vendor-auto-fetch sections + the three reason codes; README notes the fresh-clone flow; the exact-shape empty-scan contract test pins the additive lockfileOnlyPackages field; the cargo build e2e scrubs ambient CARGO_TARGET_DIR from child builds. 
Co-Authored-By: Claude Fable 5 --- README.md | 5 ++- crates/socket-patch-cli/CLI_CONTRACT.md | 7 ++++ crates/socket-patch-cli/src/commands/scan.rs | 36 ++++++++----------- .../socket-patch-cli/src/commands/vendor.rs | 14 +++----- .../socket-patch-cli/tests/cli_parse_scan.rs | 1 + .../tests/e2e_vendor_cargo_build.rs | 4 +++ 6 files changed, 35 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 72809b5..19fec6a 100644 --- a/README.md +++ b/README.md @@ -233,7 +233,10 @@ socket-patch scan -g # Scan + apply + emit an OpenVEX attestation in one pass socket-patch scan --json --sync --yes --vex socket.vex.json -# Vendor every patched dependency (committable; see the vendor command) +# Vendor every patched dependency (committable; see the vendor command). +# Works on a completely fresh clone: dependencies listed in the lockfile +# but not yet installed are fetched pristine from their registry and +# integrity-verified against the lockfile before vendoring. socket-patch scan --json --vendor --yes # Same, but keep the manifest out of it entirely diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md index de9ef89..48103b0 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -75,6 +75,10 @@ Beyond the globals above, each subcommand defines a small set of local arguments `scan` queries the patch API in `--batch-size` chunks. Authenticated runs POST `/v0/orgs/{slug}/patches/batch`; token-less runs POST `{proxy}/patch/batch` on the public proxy and degrade to per-package `GET /patch/by-package/:purl` requests in two cases: the deployed proxy predates the batch endpoint (legacy proxies answer the POST with their `400 "Unsupported endpoint"` catch-all), or the all-or-nothing batch validation rejects the chunk (e.g. 
a crawled PURL type the server doesn't recognize, such as `pkg:jsr/…` — the per-package path tolerates those individually, preserving the pre-batch scan semantics). Rate limits and over-capacity 503s surface instead of silently degrading. +**Lockfile supplement (v3.4)**: `scan` discovery is no longer limited to installed trees. The project's lockfiles (`package-lock.json`/`npm-shrinkwrap.json`, `pnpm-lock.yaml` v9, `yarn.lock` classic + berry, `bun.lock`, `Cargo.lock`, `go.sum`, `composer.lock`, `Gemfile.lock`, `uv.lock`/`poetry.lock`/pinned `requirements.txt`) are inventoried and dependencies with NO installed copy join discovery — counts, the API lookup, the table (flagged ` [NOT INSTALLED]`, plus a stderr note), and the prune "scanned" set (a wiped node_modules no longer prunes lockfile-listed entries). JSON gains a top-level `lockfileOnlyPackages` count and an additive `notInstalled: true` on matching `packages[]` entries. `--apply` partitions lockfile-only patches out BEFORE download (calm `skipped`/`package_not_installed` records — never an error exit, never a manifest write); `--vendor` passes them through to the vendor engine's auto-fetch. Vendored-ledger entries likewise stay discoverable on a fresh clone (the committed artifact is the dependency). Global scans (`--global`) get no supplement. + +**Vendor auto-fetch (v3.4)**: `vendor`/`scan --vendor` no longer fail on lockfile-resolved packages with no installed copy. Already-vendored purls stage from their committed artifact (sha256-verified against the vendor ledger; offline-safe). 
Otherwise the pristine artifact is fetched per the lockfile resolution and verified against the lock's recorded integrity FAIL-CLOSED before any write: npm SRI (or yarn classic's sha1 fragment), yarn berry's cache-zip checksum (rebuilt from the fetched tarball; cacheKey 10c0 only), Cargo.lock sha256 over the .crate, go.sum `h1:` dirhash over the module zip, composer `dist.shasum` (sha1), Gemfile.lock `CHECKSUMS` sha256, uv.lock wheel sha256 (pure `py3-none-any` wheels only). Entries the lock cannot verify are NEVER fetched (`vendor_fetch_unverifiable` warning + the calm `package_not_installed` skip). Registry bases honor `SOCKET_NPM_REGISTRY`, `SOCKET_CRATES_REGISTRY`, `SOCKET_GOPROXY` (else `GOPROXY`); npm/yarn/composer/gem/uv lock-recorded URLs are used verbatim. `--offline` refuses the fetch with the calm skip (the detail names the lockfile resolution). The fetch stages into a private tempdir — the project tree is never touched. + `scan --sync` is sugar for `--apply --prune` — the canonical single-flag bot invocation. `scan --json --sync --yes` discovers, applies, and reconciles state in one pass. `scan --vendor` swaps the in-place apply for the vendor pipeline: discover → download (manifest written, as `--apply`) → vendor every patched dependency via the same engine as the `vendor` command (under the same lock). The whole manifest is vendored, so a package vendored at an older patch uuid is **re-vendored automatically** (its old uuid dir is removed — `vendor_stale_artifact_removed`); same-uuid re-runs are `already_vendored` skips. With `--prune`, GC runs **before** the vendor step so stale manifest entries don't fail vendoring with `package_not_installed`. JSON output gains a `download` sub-object (the download phase; no `applied` field — nothing is applied in place) and a `vendor` sub-object (a full vendor Envelope). `--dry-run` previews per-patch `would_vendor` | `would_revendor` (+`oldUuid`) | `already_vendored` without network downloads or disk writes. 
Interactive mode prompts "Download and vendor N patch(es)?". @@ -604,6 +608,9 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `vendor_override_conflict` | `failed` | vendor (pnpm/yarn-berry): a user-authored override/resolution for the package already exists. | | `vendor_integrity_unverified` | `skipped` (warning) | vendor (pipenv): the lockfile format does not hash-check file entries; the committed wheel bytes are the protection. | | `vendor_content_mismatch_overwritten` | `skipped` (warning) | vendor: a staged file matched NEITHER beforeHash nor afterHash (patch built against different bytes, or local edits); the stage was overwritten with the verified patched content and the vendor succeeded. | +| `vendor_fetched_missing` | `skipped` (warning) | vendor: the package was not installed; its pristine artifact was fetched per the lockfile resolution (or staged from the committed vendor artifact), integrity-verified, and vendored — the project tree was not touched. | +| `vendor_fetch_failed` | `failed` | vendor: the lockfile-resolved fetch was attempted and failed (HTTP error, size cap, integrity mismatch, or a corrupt committed artifact). Suppresses the duplicate `package_not_installed` skip. | +| `vendor_fetch_unverifiable` | `skipped` (warning) | vendor: the lockfile records no usable integrity for the missing package; nothing was fetched (fail-closed) and the `package_not_installed` skip follows. | | `vendor_lock_checksums_unsupported` / `vendor_stale_lock_checksum` | `failed` | vendor (gem): an ambiguous/platform CHECKSUMS entry, or a v1-wired lock whose stale token blocks the hot path (run `vendor --revert` + re-vendor). | | `pypi_{poetry,pdm,pipenv}_no_lockfile` | `failed` | vendor (pypi): a lock-less tool marker with no `requirements.txt` fallback — run ` lock`. 
| | `vendor_*` / `pypi_*` / `gemfile_*` / `lock_*` / `locked_version_mismatch` / `user_authored_*` / `native_extensions_unsupported` / `platform_gem_unsupported` | `failed`/`skipped` | vendor: per-ecosystem refusal + drift vocabulary; see the Vendor command contract section. New tags are additive (MINOR). | diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index 4c562e1..06a97b0 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -303,38 +303,28 @@ async fn lockfile_supplement( crawled: &[socket_patch_core::crawlers::types::CrawledPackage], ) -> LockfileSupplement { use socket_patch_core::patch::vendor::lock_inventory; - use socket_patch_core::patch::vendor::npm_flavor::NpmLockFlavor; - let mut out = LockfileSupplement::default(); + let mut out = LockfileSupplement { + source: "project lockfiles", + ..Default::default() + }; if common.global || common.global_prefix.is_some() { return out; } - let Some((flavor, entries)) = lock_inventory::inventory_npm_lock(&common.cwd).await else { + let entries = lock_inventory::inventory_project(&common.cwd).await; + if entries.is_empty() { return out; - }; - out.source = match flavor { - NpmLockFlavor::PackageLock => "package-lock.json", - NpmLockFlavor::Pnpm => "pnpm-lock.yaml", - NpmLockFlavor::YarnClassic | NpmLockFlavor::YarnBerry => "yarn.lock", - NpmLockFlavor::Bun => "bun.lock", - }; + } let crawled_purls: HashSet<&str> = crawled.iter().map(|p| p.purl.as_str()).collect(); for entry in entries { if crawled_purls.contains(entry.purl.as_str()) { continue; } - let (namespace, name) = match entry.name.split_once('/') { - Some((scope, bare)) => (Some(scope.to_string()), bare.to_string()), - None => (None, entry.name.clone()), + let Some(pkg) = crawled_from_purl(&entry.purl, &common.cwd) else { + continue; }; out.purls.insert(entry.purl.clone()); - out.packages.push(socket_patch_core::crawlers::types::CrawledPackage 
{ - name, - version: entry.version.clone(), - namespace, - purl: entry.purl.clone(), - path: common.cwd.join("node_modules").join(&entry.name), - }); + out.packages.push(pkg); } out } @@ -1348,7 +1338,11 @@ pub async fn run(args: ScanArgs) -> i32 { // are flagged "not yet installed" everywhere a user could act on them. let lockfile_only = lockfile_supplement(&args.common, &all_crawled).await; if !lockfile_only.packages.is_empty() { - *eco_counts.entry(Ecosystem::Npm).or_insert(0) += lockfile_only.packages.len(); + for pkg in &lockfile_only.packages { + if let Some(eco) = Ecosystem::from_purl(&pkg.purl) { + *eco_counts.entry(eco).or_insert(0) += 1; + } + } all_crawled.extend(lockfile_only.packages.iter().cloned()); } let ledger_supplement = vendored_ledger_supplement(&args.common, &all_crawled).await; diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index 8c0c417..0ca3037 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -555,10 +555,7 @@ pub(crate) async fn vendor_records( if !missing.is_empty() { // The inventory is a local file read — fine offline; only the // fetch itself needs the network. - let inventory = lock_inventory::inventory_npm_lock(&common.cwd) - .await - .map(|(_, entries)| entries) - .unwrap_or_default(); + let inventory = lock_inventory::inventory_project(&common.cwd).await; let client = registry_fetch::build_registry_client(); // Pre-loaded vendor ledger for the artifact-staging path: an // already-vendored purl with no installed copy (fresh clone) @@ -909,12 +906,9 @@ pub(crate) async fn vendor_records( // Offline runs name the packages the lockfile COULD have fetched — // the inventory is a local file read, allowed offline. 
let lock_resolvable: HashSet = if common.offline { - let entries = socket_patch_core::patch::vendor::lock_inventory::inventory_npm_lock( - &common.cwd, - ) - .await - .map(|(_, e)| e) - .unwrap_or_default(); + let entries = + socket_patch_core::patch::vendor::lock_inventory::inventory_project(&common.cwd) + .await; unmatched .iter() .filter(|p| { diff --git a/crates/socket-patch-cli/tests/cli_parse_scan.rs b/crates/socket-patch-cli/tests/cli_parse_scan.rs index 359994f..b961eb4 100644 --- a/crates/socket-patch-cli/tests/cli_parse_scan.rs +++ b/crates/socket-patch-cli/tests/cli_parse_scan.rs @@ -523,6 +523,7 @@ fn scan_json_empty_cwd_emits_updates_key() { let expected = serde_json::json!({ "status": "success", "scannedPackages": 0, + "lockfileOnlyPackages": 0, "packagesWithPatches": 0, "totalPatches": 0, "freePatches": 0, diff --git a/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs b/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs index 12e1b1a..28c118d 100644 --- a/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs +++ b/crates/socket-patch-cli/tests/e2e_vendor_cargo_build.rs @@ -81,6 +81,10 @@ fn cargo(cwd: &Path, args: &[&str], cargo_home: &Path) -> Output { .args(args) .current_dir(cwd) .env("CARGO_HOME", cargo_home) + // The assertions read `/target/debug/...`; an ambient + // CARGO_TARGET_DIR (shared-build-cache setups) would redirect the + // child build elsewhere and break them. 
+ .env_remove("CARGO_TARGET_DIR") .output() .expect("failed to run cargo") } From 0426e0bc938d58483dd3f5a3ca6d468122dfef6f Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 19:32:59 -0400 Subject: [PATCH 13/19] feat(apply): beforeHash mismatch warns and applies the full blob by default; --strict restores the hard error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A file whose on-disk content matches NEITHER the patch's beforeHash nor its afterHash previously hard-failed the in-place apply (the flatted case: a patch built against non-registry bytes made plain apply unusable). The default now overwrites such files with the FULL verified patched content and continues: - core: apply_package_patch's force bool becomes MismatchPolicy {Warn (default) | Strict | Force}. Warn promotes HashMismatch to Ready keeping the warning signature (expected/current hashes); the diff strategy self-disables on a wrong base (partial patches are skipped, as they must be) and the archive/blob writes stay hash-gated to exactly afterHash — a tolerated mismatch lands verified patched bytes or fails, never silent corruption. Missing pre-existing files still fail closed (only Force skips them). - CLI: global --strict (env SOCKET_STRICT) restores the fail-closed behavior across apply/get/scan --apply/the hook/go redirects (--force overrides it); plumbed through DownloadParams into the nested applies. Vendor staging is unaffected (already auto-forces into its private stage). - Each overwrite logs a content_mismatch_overwritten warning to stderr and rides the JSON envelope as a Skipped warning event beside the package's Applied event. - Since the full content lives in the afterHash blob and the default --download-mode diff may not have staged it, a pre-apply pass probes for mismatches and downloads the missing blobs by hash (offline runs warn and let those files fail). 
Live-verified: pristine flatted@3.3.1 + its bad-baseline patch now applies 6/6 files via blob with per-file warnings (exit 0); apply --strict exits 1 with the old error and leaves files untouched. Co-Authored-By: Claude Fable 5 --- crates/socket-patch-cli/CLI_CONTRACT.md | 2 + crates/socket-patch-cli/src/args.rs | 14 ++ crates/socket-patch-cli/src/commands/apply.rs | 145 ++++++++++++- crates/socket-patch-cli/src/commands/get.rs | 6 + crates/socket-patch-cli/src/commands/scan.rs | 3 + .../socket-patch-cli/tests/apply_network.rs | 179 +++++++++++++-- .../socket-patch-cli/tests/cli_global_args.rs | 1 + .../tests/in_process_cargo_apply.rs | 21 +- .../tests/in_process_get_update_count.rs | 1 + crates/socket-patch-core/src/patch/apply.rs | 204 ++++++++++++------ .../src/patch/go_redirect.rs | 51 ++--- .../src/patch/vendor/golang.rs | 7 +- .../socket-patch-core/src/patch/vendor/mod.rs | 5 +- 13 files changed, 502 insertions(+), 137 deletions(-) diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md index 48103b0..14bc18e 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -56,6 +56,7 @@ Beyond the globals above, each subcommand defines a small set of local arguments |---|---|---|---| | `apply` | `--force` / `-f` | `SOCKET_FORCE` | Bypass beforeHash check | | `vendor` | `--force` / `-f` | `SOCKET_FORCE` | Tolerate missing patch-target files in the stage + bypass the variant probe. A beforeHash mismatch no longer needs it: vendor staging auto-overwrites with the verified patched content (`vendor_content_mismatch_overwritten` warning) | +| (global) | `--strict` | `SOCKET_STRICT` | Treat a beforeHash mismatch as a hard error in the in-place apply paths (apply/get/scan --apply/hook/go redirect). 
DEFAULT (v3.4): a mismatched file is overwritten with the FULL verified patched content (the diff strategy self-disables on a wrong base; archive/blob writes are hash-gated to exactly afterHash; the missing blob is downloaded on demand) and surfaced as a `content_mismatch_overwritten` stderr warning + Skipped event. `--force` overrides `--strict` and additionally skips missing files. Vendor staging is unaffected (it always auto-overwrites into its private stage). | | `vendor` | `--revert` | `SOCKET_VENDOR_REVERT` | Undo vendoring: restore recorded original lockfile fragments + remove `.socket/vendor/` artifacts. Works without a manifest | | `apply`, `scan`, `vendor` | `--vex` | `SOCKET_VEX` | Generate an OpenVEX 0.2.0 document at this path on a successful run; see "embedded VEX" below | | `apply`, `scan`, `vendor` | `--vex-product`, `--vex-no-verify`, `--vex-doc-id`, `--vex-compact` | `SOCKET_VEX_PRODUCT`, `SOCKET_VEX_NO_VERIFY`, `SOCKET_VEX_DOC_ID`, `SOCKET_VEX_COMPACT` | Passthrough to the embedded VEX builder; mirror the standalone `vex` knobs. Inert unless `--vex` is set | @@ -611,6 +612,7 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `vendor_fetched_missing` | `skipped` (warning) | vendor: the package was not installed; its pristine artifact was fetched per the lockfile resolution (or staged from the committed vendor artifact), integrity-verified, and vendored — the project tree was not touched. | | `vendor_fetch_failed` | `failed` | vendor: the lockfile-resolved fetch was attempted and failed (HTTP error, size cap, integrity mismatch, or a corrupt committed artifact). Suppresses the duplicate `package_not_installed` skip. | | `vendor_fetch_unverifiable` | `skipped` (warning) | vendor: the lockfile records no usable integrity for the missing package; nothing was fetched (fail-closed) and the `package_not_installed` skip follows. 
| +| `content_mismatch_overwritten` | `skipped` (warning) | apply (default policy): a file matched NEITHER beforeHash nor afterHash and was overwritten with the full verified patched content. `--strict` turns this case into a `failed` event instead. | | `vendor_lock_checksums_unsupported` / `vendor_stale_lock_checksum` | `failed` | vendor (gem): an ambiguous/platform CHECKSUMS entry, or a v1-wired lock whose stale token blocks the hot path (run `vendor --revert` + re-vendor). | | `pypi_{poetry,pdm,pipenv}_no_lockfile` | `failed` | vendor (pypi): a lock-less tool marker with no `requirements.txt` fallback — run ` lock`. | | `vendor_*` / `pypi_*` / `gemfile_*` / `lock_*` / `locked_version_mismatch` / `user_authored_*` / `native_extensions_unsupported` / `platform_gem_unsupported` | `failed`/`skipped` | vendor: per-ecosystem refusal + drift vocabulary; see the Vendor command contract section. New tags are additive (MINOR). | diff --git a/crates/socket-patch-cli/src/args.rs b/crates/socket-patch-cli/src/args.rs index 784b0fe..1fde519 100644 --- a/crates/socket-patch-cli/src/args.rs +++ b/crates/socket-patch-cli/src/args.rs @@ -144,6 +144,19 @@ pub struct GlobalArgs { )] pub offline: bool, + /// Treat a beforeHash mismatch as a hard error. By DEFAULT a file whose + /// on-disk content matches neither the patch's beforeHash nor its + /// afterHash is overwritten with the full verified patched content and + /// surfaced as a stderr warning (`content_mismatch_overwritten`); this + /// flag restores the fail-closed behavior. `--force` overrides it. + #[arg( + long, + env = "SOCKET_STRICT", + default_value_t = false, + value_parser = parse_bool_flag, + )] + pub strict: bool, + /// Operate on globally-installed packages. 
#[arg( long = "global", @@ -378,6 +391,7 @@ impl Default for GlobalArgs { ecosystems: None, download_mode: "diff".to_string(), offline: false, + strict: false, global: false, global_prefix: None, json: false, diff --git a/crates/socket-patch-cli/src/commands/apply.rs b/crates/socket-patch-cli/src/commands/apply.rs index 07d11db..610f391 100644 --- a/crates/socket-patch-cli/src/commands/apply.rs +++ b/crates/socket-patch-cli/src/commands/apply.rs @@ -5,9 +5,115 @@ use socket_patch_core::crawlers::{ }; use socket_patch_core::manifest::operations::read_manifest; use socket_patch_core::manifest::schema::PatchRecord; -use socket_patch_core::patch::apply::{ +use socket_patch_core::patch::apply::{MismatchPolicy, apply_package_patch, verify_file_patch, ApplyResult, PatchSources, VerifyStatus, }; +/// Files whose pre-apply content matched NEITHER hash and were (or would +/// be) overwritten with the verified patched content — the promoted +/// verify signature `apply_package_patch` leaves behind under the default +/// mismatch policy. +pub(crate) fn mismatch_overwritten_files(result: &ApplyResult) -> Vec { + result + .files_verified + .iter() + .filter(|v| { + v.status == VerifyStatus::Ready + && v.expected_hash.is_some() + && v.current_hash != v.expected_hash + }) + .map(|v| v.file.clone()) + .collect() +} + +/// Surface one mismatch-overwrite per file on stderr (human mode). 
+fn warn_mismatch_overwrites(result: &ApplyResult, common: &GlobalArgs) { + if common.json || common.silent { + return; + } + for file in mismatch_overwritten_files(result) { + eprintln!( + "Warning (content_mismatch_overwritten): {} {file} did not match the patch's \ + expected original content; applied the full verified patched content instead \ + (pass --strict to fail on mismatches)", + socket_patch_core::utils::purl::normalize_purl(&result.package_key) + ); + } +} + +/// The default mismatch policy applies the FULL patched content for +/// mismatched files — and the full content lives in the afterHash blob, +/// which the default `--download-mode diff` may not have staged. Probe the +/// in-scope packages for mismatches and fetch the missing afterHash blobs +/// by hash (online only) so the apply below can fall through diff → blob. +async fn ensure_blobs_for_mismatches( + args: &ApplyArgs, + manifest: &socket_patch_core::manifest::schema::PatchManifest, + all_packages: &HashMap, + blobs_path: &Path, +) { + if args.common.strict && !args.force { + return; // strict fails on mismatch — nothing to fetch + } + let mut needed: std::collections::HashSet = std::collections::HashSet::new(); + for (purl, pkg_path) in all_packages { + let Some(record) = manifest.patches.get(purl) else { + continue; + }; + for (file_name, info) in &record.files { + if info.before_hash.is_empty() { + continue; + } + let verify = verify_file_patch(pkg_path, file_name, info).await; + if verify.status == socket_patch_core::patch::apply::VerifyStatus::HashMismatch + && tokio::fs::metadata(blobs_path.join(&info.after_hash)) + .await + .is_err() + { + needed.insert(info.after_hash.clone()); + } + } + } + if needed.is_empty() { + return; + } + if args.common.offline { + if !args.common.silent && !args.common.json { + eprintln!( + "Warning: {} mismatched file(s) need their full patched blob, but --offline \ + prevents fetching; those files will fail to apply", + needed.len() + ); + } + return; + 
} + if !args.common.silent && !args.common.json { + eprintln!( + "Downloading {} full patched blob(s) for mismatched file(s)...", + needed.len() + ); + } + let (client, _) = get_api_client_with_overrides(args.common.api_client_overrides()).await; + let _ = socket_patch_core::api::blob_fetcher::fetch_blobs_by_hash( + &needed, + blobs_path, + &client, + None, + ) + .await; +} + +/// The mismatch policy this run applies with: `--force` ⊃ default +/// (adds the missing-file skip), `--strict` restores fail-closed. +pub(crate) fn mismatch_policy(force: bool, strict: bool) -> MismatchPolicy { + if force { + MismatchPolicy::Force + } else if strict { + MismatchPolicy::Strict + } else { + MismatchPolicy::Warn + } +} + #[cfg(feature = "golang")] use socket_patch_core::patch::go_redirect::{ apply_go_redirect, reconcile_go_redirects, verify_go_redirect_state, @@ -102,7 +208,7 @@ async fn try_local_go_apply( patch: &PatchRecord, sources: &PatchSources<'_>, common: &GlobalArgs, - force: bool, + policy: MismatchPolicy, ) -> Option { if !is_local_go(purl, common) { return None; @@ -126,7 +232,7 @@ async fn try_local_go_apply( sources, Some(&patch.uuid), common.dry_run, - force, + policy, ) .await, ) @@ -139,7 +245,7 @@ async fn try_local_go_apply( _patch: &PatchRecord, _sources: &PatchSources<'_>, _common: &GlobalArgs, - _force: bool, + _policy: MismatchPolicy, ) -> Option { None } @@ -538,6 +644,21 @@ pub async fn run(args: ApplyArgs) -> i32 { } for result in &results { env.record(result_to_event(result, args.common.dry_run)); + // Mismatch overwrites ride as Skipped warning events + // (same pattern as the vendor warnings): the package's + // Applied event stands, the warning is per-file. 
+ for file in mismatch_overwritten_files(result) { + env.record( + PatchEvent::new(PatchAction::Skipped, result.package_key.clone()) + .with_reason( + "content_mismatch_overwritten", + format!( + "{file} did not match the patch's expected original \ + content; the full verified patched content was applied" + ), + ), + ); + } // Sidecar records live on the envelope, not on // individual events. Consumers iterate // `envelope.sidecars[]` and JOIN against @@ -888,6 +1009,7 @@ async fn apply_patches_inner( } // Apply patches + ensure_blobs_for_mismatches(args, &manifest, &all_packages, &blobs_path).await; let mut has_errors = false; // Group release-variant PURLs by base. PyPI (`?artifact_id=`), @@ -977,10 +1099,11 @@ async fn apply_patches_inner( &sources, Some(&patch.uuid), args.common.dry_run, - args.force, + mismatch_policy(args.force, args.common.strict), ) .await; + warn_mismatch_overwrites(&result, &args.common); // A variant that reached apply is the installed distribution // (it passed the first-file check, or `--force` bypassed it), // so record it as matched whether or not the patch succeeded. @@ -1060,7 +1183,14 @@ async fn apply_patches_inner( // cache) — patches in place via `apply_package_patch`. Without the // `golang` feature `try_local_go_apply` is an inert `None`. 
let result = - match try_local_go_apply(purl, pkg_path, patch, &sources, &args.common, args.force) + match try_local_go_apply( + purl, + pkg_path, + patch, + &sources, + &args.common, + mismatch_policy(args.force, args.common.strict), + ) .await { Some(r) => r, @@ -1072,12 +1202,13 @@ async fn apply_patches_inner( &sources, Some(&patch.uuid), args.common.dry_run, - args.force, + mismatch_policy(args.force, args.common.strict), ) .await } }; + warn_mismatch_overwrites(&result, &args.common); if !result.success { has_errors = true; if !args.common.silent && !args.common.json { diff --git a/crates/socket-patch-cli/src/commands/get.rs b/crates/socket-patch-cli/src/commands/get.rs index 0f520f6..b6f1271 100644 --- a/crates/socket-patch-cli/src/commands/get.rs +++ b/crates/socket-patch-cli/src/commands/get.rs @@ -580,6 +580,9 @@ pub struct DownloadParams { /// `true` (`--all-releases`), every variant is downloaded. No effect /// on ecosystems without per-release artifact_id variants. pub all_releases: bool, + /// `--strict` forwarded to the nested apply (a beforeHash mismatch + /// fails instead of warn-and-overwrite). 
+ pub strict: bool, } /// Narrow a selection of patches down to the release variant(s) present @@ -1193,6 +1196,7 @@ pub async fn download_and_apply_patches( global_prefix: params.global_prefix.clone(), silent: params.json || params.silent, download_mode: params.download_mode.clone(), + strict: params.strict, ..crate::args::GlobalArgs::default() }, force: false, @@ -1621,6 +1625,7 @@ pub async fn run(args: GetArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, }; let (code, result_json) = download_and_apply_patches(&selected, ¶ms).await; @@ -1810,6 +1815,7 @@ async fn save_and_apply_patch(args: &GetArgs, patch: &PatchResponse) -> i32 { global_prefix: args.common.global_prefix.clone(), silent: quiet, download_mode: args.common.download_mode.clone(), + strict: args.common.strict, ..crate::args::GlobalArgs::default() }, force: false, diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index 06a97b0..3f2b989 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -864,6 +864,7 @@ async fn run_vendor_json_path( download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, }; let mut has_errors = false; let detached_records: Option> = if args.detached { @@ -1821,6 +1822,7 @@ pub async fn run(args: ScanArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, + strict: args.common.strict, }; let (code, apply_json) = download_and_apply_patches(&selected, ¶ms).await; apply_code = code; @@ -2304,6 +2306,7 @@ pub async fn run(args: ScanArgs) -> i32 { download_mode: args.common.download_mode.clone(), api_overrides: args.common.api_client_overrides(), 
all_releases: args.all_releases, + strict: args.common.strict, }; let code = if args.vendor { diff --git a/crates/socket-patch-cli/tests/apply_network.rs b/crates/socket-patch-cli/tests/apply_network.rs index d9bb628..08f1fd8 100644 --- a/crates/socket-patch-cli/tests/apply_network.rs +++ b/crates/socket-patch-cli/tests/apply_network.rs @@ -449,49 +449,98 @@ async fn apply_with_force_overrides_hash_mismatch() { } #[tokio::test] -async fn apply_without_force_hash_mismatch_emits_failed_event() { +async fn apply_hash_mismatch_default_warns_and_applies_strict_fails() { let after = b"after\n"; let after_hash = git_sha256(after); let expected_before = b"expected-before\n"; let actual_before = b"DIFFERENT-CONTENT\n"; let expected_before_hash = git_sha256(expected_before); - let tmp = tempfile::tempdir().expect("tempdir"); - write_root_package_json(tmp.path()); - write_npm_package(tmp.path(), "mismatch", "1.0.0", "index.js", actual_before); - let socket = tmp.path().join(".socket"); - write_manifest_with_patch( - &socket, - "pkg:npm/mismatch@1.0.0", - "11111111-1111-4111-8111-111111111111", - &expected_before_hash, - &after_hash, - ); - let blobs = socket.join("blobs"); - std::fs::create_dir_all(&blobs).unwrap(); - std::fs::write(blobs.join(&after_hash), after).unwrap(); + let fixture = || { + let tmp = tempfile::tempdir().expect("tempdir"); + write_root_package_json(tmp.path()); + write_npm_package(tmp.path(), "mismatch", "1.0.0", "index.js", actual_before); + let socket = tmp.path().join(".socket"); + write_manifest_with_patch( + &socket, + "pkg:npm/mismatch@1.0.0", + "11111111-1111-4111-8111-111111111111", + &expected_before_hash, + &after_hash, + ); + let blobs = socket.join("blobs"); + std::fs::create_dir_all(&blobs).unwrap(); + std::fs::write(blobs.join(&after_hash), after).unwrap(); + tmp + }; + // DEFAULT: the mismatch is overwritten with the full verified patched + // content (the diff strategy would self-skip; the blob is hash-gated to + // afterHash) and 
surfaced as a warning event — exit 0. + let tmp = fixture(); let out = Command::new(binary()) .args(["apply", "--json", "--offline"]) .current_dir(tmp.path()) .env_remove("SOCKET_API_TOKEN") .output() .expect("run socket-patch"); - let code = out.status.code().unwrap_or(-1); let stdout = String::from_utf8_lossy(&out.stdout).to_string(); - assert_eq!(code, 1, "hash mismatch w/o --force must exit 1"); let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); - assert_eq!(v["status"], "partialFailure"); + assert_eq!( + out.status.code().unwrap_or(-1), + 0, + "default mismatch is a warning, not an error: {v:#}" + ); + assert_eq!(v["status"], "success", "{v:#}"); let events = v["events"].as_array().expect("events array"); - let has_failed = events.iter().any(|e| e["action"] == "failed"); assert!( - has_failed, - "must emit a failed event on hash mismatch; got events={events:?}" + events.iter().any(|e| e["action"] == "applied"), + "{events:?}" + ); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "content_mismatch_overwritten"), + "the overwrite is surfaced as a warning event: {events:?}" ); + let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); + assert_eq!(content, after, "the file carries the verified patched bytes"); - // File must be UNCHANGED. + // The human run logs the warning to stderr. + let tmp = fixture(); + let out = Command::new(binary()) + .args(["apply", "--offline", "--yes"]) + .current_dir(tmp.path()) + .env_remove("SOCKET_API_TOKEN") + .output() + .expect("run socket-patch"); + let stderr = String::from_utf8_lossy(&out.stderr); + assert_eq!(out.status.code().unwrap_or(-1), 0, "stderr={stderr}"); + assert!( + stderr.contains("content_mismatch_overwritten"), + "stderr warning present: {stderr}" + ); + + // --strict: the old fail-closed contract — exit 1, failed event, file + // untouched. 
+ let tmp = fixture(); + let out = Command::new(binary()) + .args(["apply", "--json", "--offline", "--strict"]) + .current_dir(tmp.path()) + .env_remove("SOCKET_API_TOKEN") + .output() + .expect("run socket-patch"); + let stdout = String::from_utf8_lossy(&out.stdout).to_string(); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(out.status.code().unwrap_or(-1), 1, "{v:#}"); + assert_eq!(v["status"], "partialFailure", "{v:#}"); + let events = v["events"].as_array().expect("events array"); + assert!( + events.iter().any(|e| e["action"] == "failed"), + "strict emits a failed event: {events:?}" + ); let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); - assert_eq!(content, actual_before, "hash mismatch must not modify file"); + assert_eq!(content, actual_before, "strict must not modify the file"); } // --------------------------------------------------------------------------- @@ -650,3 +699,87 @@ async fn apply_uses_locally_cached_blob_without_fetching() { "cached blob must survive apply" ); } + +// --------------------------------------------------------------------------- +// Mismatch + diff-mode sources: the full blob is redownloaded on demand. +// --------------------------------------------------------------------------- + +/// A mismatched file cannot be patched from a partial source (the diff +/// strategy needs the exact before-bytes), so the default mismatch policy +/// redownloads the FULL afterHash blob and applies that — even when a +/// local source archive made the stage step skip downloading. 
+#[tokio::test] +async fn apply_mismatch_redownloads_full_blob_and_applies() { + let after = b"after\n"; + let after_hash = git_sha256(after); + let expected_before_hash = git_sha256(b"expected-before\n"); + + let mock = MockServer::start().await; + Mock::given(method("GET")) + .and(path(format!( + "/v0/orgs/{ORG_SLUG}/patches/blob/{after_hash}" + ))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(after.to_vec())) + .mount(&mock) + .await; + + let uuid = "11111111-1111-4111-8111-111111111111"; + let tmp = tempfile::tempdir().expect("tempdir"); + write_root_package_json(tmp.path()); + write_npm_package( + tmp.path(), + "mismatch", + "1.0.0", + "index.js", + b"DIFFERENT-CONTENT\n", + ); + let socket = tmp.path().join(".socket"); + write_manifest_with_patch( + &socket, + "pkg:npm/mismatch@1.0.0", + uuid, + &expected_before_hash, + &after_hash, + ); + // A LOCAL package archive exists (so the stage step downloads nothing) + // but carries no entry for index.js — only the blob can produce the + // patched bytes, and no blob is staged. 
+ let packages = socket.join("packages"); + std::fs::create_dir_all(&packages).unwrap(); + { + use std::io::Write as _; + let mut builder = tar::Builder::new(flate2::write::GzEncoder::new( + std::fs::File::create(packages.join(format!("{uuid}.tar.gz"))).unwrap(), + flate2::Compression::default(), + )); + let mut header = tar::Header::new_gnu(); + let bytes = b"unrelated"; + header.set_size(bytes.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + builder.append_data(&mut header, "other.js", &bytes[..]).unwrap(); + builder.into_inner().unwrap().finish().unwrap().flush().unwrap(); + } + + let (code, stdout, stderr) = run_apply(tmp.path(), &mock.uri(), &[]); + let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); + assert_eq!(code, 0, "stdout={v:#}\nstderr={stderr}"); + let events = v["events"].as_array().expect("events array"); + assert!( + events + .iter() + .any(|e| e["errorCode"] == "content_mismatch_overwritten"), + "{events:?}" + ); + + // The blob was fetched on demand… + let requests = mock.received_requests().await.unwrap(); + let blob_path = format!("/v0/orgs/{ORG_SLUG}/patches/blob/{after_hash}"); + assert!( + requests.iter().any(|r| r.url.path() == blob_path), + "the full blob must be redownloaded for the mismatched file" + ); + // …and the file carries the verified patched bytes. + let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); + assert_eq!(content, after); +} diff --git a/crates/socket-patch-cli/tests/cli_global_args.rs b/crates/socket-patch-cli/tests/cli_global_args.rs index bbcb84b..6faebd8 100644 --- a/crates/socket-patch-cli/tests/cli_global_args.rs +++ b/crates/socket-patch-cli/tests/cli_global_args.rs @@ -202,6 +202,7 @@ fn global_flag_cases_cover_every_global_field() { break_lock: _, debug: _, no_telemetry: _, + strict: _, } = common; // 20 fields ↔ 20 long-flag cases. 
Bump both this count and add a case when diff --git a/crates/socket-patch-cli/tests/in_process_cargo_apply.rs b/crates/socket-patch-cli/tests/in_process_cargo_apply.rs index 860c4dd..bf17233 100644 --- a/crates/socket-patch-cli/tests/in_process_cargo_apply.rs +++ b/crates/socket-patch-cli/tests/in_process_cargo_apply.rs @@ -293,12 +293,14 @@ async fn cargo_fetch_scan_sync_patches_real_file() { } /// Safety gate: when the patch's advertised `beforeHash` does NOT match the -/// on-disk file, apply must REFUSE to write (it cannot trust that the blob is -/// a valid successor of whatever is actually on disk). The positive test -/// above only ever feeds a correct `beforeHash`, so a regression that made -/// apply blindly clobber the file regardless of its current content would -/// sail through it. This test pins the refusal: the file must be left -/// byte-for-byte untouched and the run must NOT report success. +/// on-disk file, `--strict` apply must REFUSE to write (the v3.4 DEFAULT +/// instead overwrites with the verified afterHash content and warns — see +/// `apply_hash_mismatch_default_warns_and_applies_strict_fails`). The +/// positive test above only ever feeds a correct `beforeHash`, so a +/// regression that made strict mode clobber the file regardless of its +/// current content would sail through it. This test pins the strict +/// refusal: the file must be left byte-for-byte untouched and the run must +/// NOT report success. #[tokio::test] #[serial] async fn cargo_apply_refuses_on_before_hash_mismatch() { @@ -344,9 +346,10 @@ async fn cargo_apply_refuses_on_before_hash_mismatch() { ecosystems: Some(vec!["cargo".to_string()]), download_mode: "diff".to_string(), dry_run: false, - // force MUST stay false: with --force, a hash mismatch is - // deliberately downgraded to "ready" and the file WOULD be - // overwritten. We are asserting the safe default refuses. 
+ // strict pins the fail-closed contract: the v3.4 default (and + // --force) deliberately downgrade a hash mismatch to "ready" + // and the file WOULD be overwritten with verified content. + strict: true, ..socket_patch_cli::args::GlobalArgs::default() }, batch_size: 100, diff --git a/crates/socket-patch-cli/tests/in_process_get_update_count.rs b/crates/socket-patch-cli/tests/in_process_get_update_count.rs index a2101ae..36e4bef 100644 --- a/crates/socket-patch-cli/tests/in_process_get_update_count.rs +++ b/crates/socket-patch-cli/tests/in_process_get_update_count.rs @@ -72,6 +72,7 @@ fn params(root: &Path, server: &MockServer) -> DownloadParams { org_slug: Some(ORG.to_string()), proxy_url: None, }, + strict: false, // Skip release-narrowing; npm has no variants anyway. all_releases: true, } diff --git a/crates/socket-patch-core/src/patch/apply.rs b/crates/socket-patch-core/src/patch/apply.rs index e880707..fe5a9d5 100644 --- a/crates/socket-patch-core/src/patch/apply.rs +++ b/crates/socket-patch-core/src/patch/apply.rs @@ -11,7 +11,7 @@ use crate::patch::file_hash::compute_file_git_sha256; use crate::patch::package::read_archive_filtered; /// Status of a file patch verification. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum VerifyStatus { /// File is ready to be patched (current hash matches beforeHash). Ready, @@ -34,6 +34,33 @@ pub struct VerifyResult { pub target_hash: Option, } +/// How the apply pipeline treats a file whose on-disk content matches +/// NEITHER `beforeHash` nor `afterHash` (and a pre-existing file that is +/// missing). +/// +/// Mismatch tolerance is safe content-wise in every mode: the diff +/// strategy self-disables on a wrong base, and the archive/blob +/// strategies verify their bytes hash to exactly `afterHash` BEFORE any +/// write — a tolerated mismatch is overwritten with the verified patched +/// content or fails, never silently corrupted. 
What tolerance can do is +/// discard local modifications to the dependency file, which is why +/// `Strict` exists. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum MismatchPolicy { + /// DEFAULT: a beforeHash mismatch is overwritten with the verified + /// patched content and surfaced as a warning (the promoted + /// [`VerifyResult`] keeps `expected_hash`/`current_hash`, which is + /// how callers detect and report it). A MISSING pre-existing file is + /// still a hard error. + #[default] + Warn, + /// A beforeHash mismatch is a hard error (`--strict`). + Strict, + /// [`MismatchPolicy::Warn`] PLUS missing pre-existing files are + /// skipped instead of failing (`--force`). + Force, +} + /// Which patch source actually wrote the patched bytes for a file. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum AppliedVia { @@ -682,7 +709,7 @@ pub async fn apply_package_patch( sources: &PatchSources<'_>, uuid: Option<&str>, dry_run: bool, - force: bool, + policy: MismatchPolicy, ) -> ApplyResult { let mut result = ApplyResult { package_key: package_key.to_string(), @@ -714,30 +741,32 @@ pub async fn apply_package_patch( if verify_result.status != VerifyStatus::Ready && verify_result.status != VerifyStatus::AlreadyPatched { - if force { - match verify_result.status { - VerifyStatus::HashMismatch => { - // Force: treat hash mismatch as ready - verify_result.status = VerifyStatus::Ready; - } - VerifyStatus::NotFound => { - // Force: skip files that don't exist (non-new files) - result.files_verified.push(verify_result); - continue; - } - _ => {} + match (verify_result.status, policy) { + // Mismatch tolerated (default + force): promote to Ready. + // The promoted result KEEPS `expected_hash`/`current_hash` + // — the signature callers use to surface the warning. The + // diff strategy self-disables on the wrong base; the + // archive/blob strategies are hash-gated to afterHash. 
+ (VerifyStatus::HashMismatch, MismatchPolicy::Warn | MismatchPolicy::Force) => { + verify_result.status = VerifyStatus::Ready; + } + // Force only: skip missing pre-existing files. + (VerifyStatus::NotFound, MismatchPolicy::Force) => { + result.files_verified.push(verify_result); + continue; + } + _ => { + let msg = verify_result + .message + .clone() + .unwrap_or_else(|| format!("{:?}", verify_result.status)); + result.error = Some(format!( + "Cannot apply patch: {} - {}", + verify_result.file, msg + )); + result.files_verified.push(verify_result); + return result; } - } else { - let msg = verify_result - .message - .clone() - .unwrap_or_else(|| format!("{:?}", verify_result.status)); - result.error = Some(format!( - "Cannot apply patch: {} - {}", - verify_result.file, msg - )); - result.files_verified.push(verify_result); - return result; } } @@ -1654,7 +1683,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1706,7 +1735,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1743,7 +1772,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, true, - false, + MismatchPolicy::Warn, ) .await; @@ -1785,7 +1814,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1818,7 +1847,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1826,24 +1855,23 @@ mod tests { assert!(result.error.is_some()); } + /// beforeHash mismatch across the three policies: the DEFAULT (Warn) + /// overwrites with the verified patched content and keeps the + /// promoted warning signature (`Ready` + `expected_hash: Some` + + /// differing `current_hash`); `Strict` is the old hard error; `Force` + /// behaves like Warn (its extra tolerance is missing files). 
#[tokio::test] - async fn test_apply_package_patch_force_hash_mismatch() { + async fn test_apply_package_patch_hash_mismatch_policies() { let pkg_dir = tempfile::tempdir().unwrap(); let blobs_dir = tempfile::tempdir().unwrap(); let patched = b"patched content"; let after_hash = compute_git_sha256_from_bytes(patched); + let divergent = b"something unexpected"; - // Write a file whose hash does NOT match before_hash - tokio::fs::write(pkg_dir.path().join("index.js"), b"something unexpected") - .await - .unwrap(); - - // Write blob tokio::fs::write(blobs_dir.path().join(&after_hash), patched) .await .unwrap(); - let mut files = HashMap::new(); files.insert( "index.js".to_string(), @@ -1853,25 +1881,41 @@ mod tests { }, ); - // Without force: should fail - let result = apply_package_patch( - "pkg:npm/test@1.0.0", - pkg_dir.path(), - &files, - &PatchSources::blobs_only(blobs_dir.path()), - None, - false, - false, - ) - .await; - assert!(!result.success); + for policy in [MismatchPolicy::Warn, MismatchPolicy::Force] { + tokio::fs::write(pkg_dir.path().join("index.js"), divergent) + .await + .unwrap(); + let result = apply_package_patch( + "pkg:npm/test@1.0.0", + pkg_dir.path(), + &files, + &PatchSources::blobs_only(blobs_dir.path()), + None, + false, + policy, + ) + .await; + assert!(result.success, "{policy:?}: {:?}", result.error); + assert_eq!(result.files_patched.len(), 1, "{policy:?}"); + // The promoted verify keeps the mismatch signature for the + // caller's warning report. + let v = &result.files_verified[0]; + assert_eq!(v.status, VerifyStatus::Ready, "{policy:?}"); + assert!( + v.expected_hash.is_some() && v.current_hash != v.expected_hash, + "{policy:?}: promoted signature retained" + ); + // The bytes on disk are EXACTLY the verified patched content. 
+ let written = tokio::fs::read(pkg_dir.path().join("index.js")) + .await + .unwrap(); + assert_eq!(written, patched, "{policy:?}"); + } - // Reset the file - tokio::fs::write(pkg_dir.path().join("index.js"), b"something unexpected") + // Strict: the old fail-closed behavior, file untouched. + tokio::fs::write(pkg_dir.path().join("index.js"), divergent) .await .unwrap(); - - // With force: should succeed let result = apply_package_patch( "pkg:npm/test@1.0.0", pkg_dir.path(), @@ -1879,16 +1923,38 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - true, + MismatchPolicy::Strict, ) .await; - assert!(result.success); - assert_eq!(result.files_patched.len(), 1); + assert!(!result.success); + assert!(result + .error + .as_deref() + .unwrap_or("") + .contains("does not match")); + let untouched = tokio::fs::read(pkg_dir.path().join("index.js")) + .await + .unwrap(); + assert_eq!(untouched, divergent, "strict never writes"); - let written = tokio::fs::read(pkg_dir.path().join("index.js")) + // A missing pre-existing file is STILL an error by default and + // under strict — only Force skips it. 
+ tokio::fs::remove_file(pkg_dir.path().join("index.js")) .await .unwrap(); - assert_eq!(written, patched); + for policy in [MismatchPolicy::Warn, MismatchPolicy::Strict] { + let result = apply_package_patch( + "pkg:npm/test@1.0.0", + pkg_dir.path(), + &files, + &PatchSources::blobs_only(blobs_dir.path()), + None, + false, + policy, + ) + .await; + assert!(!result.success, "{policy:?}: missing file fails closed"); + } } #[tokio::test] @@ -1913,7 +1979,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(!result.success); @@ -1926,7 +1992,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - true, + MismatchPolicy::Force, ) .await; assert!(result.success); @@ -2054,7 +2120,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2089,7 +2155,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2123,7 +2189,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2152,7 +2218,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -2189,7 +2255,7 @@ mod tests { &sources, Some(TEST_UUID), false, - true, // --force + MismatchPolicy::Force, ) .await; @@ -2229,7 +2295,7 @@ mod tests { &sources, Some(TEST_UUID), false, - false, + MismatchPolicy::Warn, ) .await; @@ -2259,7 +2325,7 @@ mod tests { &sources, Some(TEST_UUID), true, // dry-run - false, + MismatchPolicy::Warn, ) .await; @@ -2553,7 +2619,7 @@ mod tests { &PatchSources::blobs_only(blobs_dir.path()), None, false, - false, + MismatchPolicy::Warn, ) .await; diff --git a/crates/socket-patch-core/src/patch/go_redirect.rs b/crates/socket-patch-core/src/patch/go_redirect.rs index 34e5778..f54cd60 100644 --- a/crates/socket-patch-core/src/patch/go_redirect.rs +++ b/crates/socket-patch-core/src/patch/go_redirect.rs @@ -27,7 +27,8 @@ use std::path::{Path, PathBuf}; use 
crate::manifest::schema::{PatchFileInfo, PatchManifest}; use crate::patch::apply::{ - apply_package_patch, normalize_file_path, ApplyResult, PatchSources, VerifyResult, VerifyStatus, + apply_package_patch, normalize_file_path, ApplyResult, MismatchPolicy, PatchSources, + VerifyResult, VerifyStatus, }; use crate::patch::file_hash::compute_file_git_sha256; use crate::utils::purl::{build_golang_purl, parse_golang_purl, strip_purl_qualifiers}; @@ -164,7 +165,7 @@ pub async fn apply_go_redirect( sources: &PatchSources<'_>, uuid: Option<&str>, dry_run: bool, - force: bool, + policy: MismatchPolicy, ) -> ApplyResult { // SECURITY: refuse coordinates that would escape the copy base. // A `..`/separator-laden `module`/`version` (a tampered manifest PURL) would @@ -195,7 +196,7 @@ pub async fn apply_go_redirect( // Verify (read-only) against the pristine source for an accurate // "would patch" report, without creating the copy or editing go.mod. let mut result = - apply_package_patch(purl, pristine_src, files, sources, uuid, true, force).await; + apply_package_patch(purl, pristine_src, files, sources, uuid, true, policy).await; result.package_path = copy_dir.display().to_string(); result.sidecar = None; // a replace copy is not the cache (no go.sum advisory) return result; @@ -235,7 +236,7 @@ pub async fn apply_go_redirect( } // Delegate to the hardened pipeline, pointed at the copy. 
- let mut result = apply_package_patch(purl, ©_dir, files, sources, uuid, false, force).await; + let mut result = apply_package_patch(purl, ©_dir, files, sources, uuid, false, policy).await; result.package_path = copy_dir.display().to_string(); // The golang sidecar advisory ("go mod verify will fail against go.sum") // is about in-cache patching; a `replace` copy bypasses go.sum entirely, so @@ -761,7 +762,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -810,7 +811,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -830,7 +831,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -866,7 +867,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -884,7 +885,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -910,7 +911,7 @@ mod tests { &sources, None, true, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -945,7 +946,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(!result.success); @@ -980,7 +981,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1016,7 +1017,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1056,7 +1057,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1090,7 +1091,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1119,7 +1120,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; // Add a user-authored replace. 
@@ -1157,7 +1158,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1209,7 +1210,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; // Drop the directive but keep the copy. @@ -1243,7 +1244,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1287,7 +1288,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1328,7 +1329,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1365,7 +1366,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success); @@ -1446,7 +1447,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(result.success, "apply failed: {:?}", result.error); @@ -1495,7 +1496,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; @@ -1536,7 +1537,7 @@ mod tests { &sources, None, false, - false, + MismatchPolicy::Warn, ) .await; assert!(!result.success); diff --git a/crates/socket-patch-core/src/patch/vendor/golang.rs b/crates/socket-patch-core/src/patch/vendor/golang.rs index 066676c..d532485 100644 --- a/crates/socket-patch-core/src/patch/vendor/golang.rs +++ b/crates/socket-patch-core/src/patch/vendor/golang.rs @@ -137,7 +137,7 @@ pub async fn vendor_go_module( sources, Some(&record.uuid), dry_run, - /*force=*/ true, + crate::patch::apply::MismatchPolicy::Force, ) .await; if result.success { @@ -335,6 +335,7 @@ pub async fn revert_go_vendor( #[cfg(test)] mod tests { use super::*; + use crate::patch::apply::MismatchPolicy; use crate::hash::git_sha256::compute_git_sha256_from_bytes; use crate::manifest::schema::{PatchFileInfo, VulnerabilityInfo}; use crate::patch::apply::ApplyResult; @@ -556,7 +557,7 @@ mod tests { &sources, Some(UUID), false, - false, + MismatchPolicy::Warn, ) .await; assert!(pre.success, "fixture redirect failed: {:?}", pre.error); @@ -720,7 +721,7 @@ mod tests { 
&sources, Some(UUID), false, - false, + MismatchPolicy::Warn, ) .await; let (_result, entry, _warnings) = diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 05fddbf..8b13f22 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -229,7 +229,10 @@ pub(crate) async fn force_apply_staged( sources, Some(&record.uuid), dry_run, - /*force=*/ true, + // The stage is private and every write path is afterHash-gated; + // Force additionally covers the caller's --force NotFound-skip + // (the missing-file pre-check above handles the default case). + crate::patch::apply::MismatchPolicy::Force, ) .await; if result.success { From 64c59f19ef5ef4d04b0a534c8020abb2f9af50dc Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Thu, 11 Jun 2026 19:36:58 -0400 Subject: [PATCH 14/19] polish(apply): decode percent-encoded purls in human output The 'Patched packages' summary and the no-matching-installed-package warning printed manifest keys verbatim (pkg:npm/%40scope/...); show the decoded form like the scan/vendor output does. JSON keeps verbatim keys. 
Co-Authored-By: Claude Fable 5 --- crates/socket-patch-cli/src/commands/apply.rs | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/crates/socket-patch-cli/src/commands/apply.rs b/crates/socket-patch-cli/src/commands/apply.rs index 610f391..e2cd0db 100644 --- a/crates/socket-patch-cli/src/commands/apply.rs +++ b/crates/socket-patch-cli/src/commands/apply.rs @@ -730,9 +730,20 @@ pub async fn run(args: ApplyArgs) -> i32 { } else { format!(" (via {})", tags.join("+")) }; - println!(" {}{}", result.package_key, suffix); + println!( + " {}{}", + socket_patch_core::utils::purl::normalize_purl( + &result.package_key + ), + suffix + ); } else if all_files_already_patched(result) { - println!(" {} (already patched)", result.package_key); + println!( + " {} (already patched)", + socket_patch_core::utils::purl::normalize_purl( + &result.package_key + ) + ); } } } @@ -1242,7 +1253,10 @@ async fn apply_patches_inner( unmatched.len() ); for purl in &unmatched { - eprintln!(" - {}", purl); + eprintln!( + " - {}", + socket_patch_core::utils::purl::normalize_purl(purl) + ); } } From 18b4cb16f2558dcef650bb294f2bc768ece03334 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Fri, 12 Jun 2026 01:00:51 -0400 Subject: [PATCH 15/19] =?UTF-8?q?feat(vendor):=20hold=20patch=20blobs=20in?= =?UTF-8?q?=20memory=20=E2=80=94=20vendoring=20writes=20no=20.socket/blobs?= =?UTF-8?q?=20or=20temp=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vendor flows (vendor, scan --vendor, --detached) no longer persist patch content anywhere on disk: a vendored project's .socket holds only manifest.json and vendor/. - core: PatchSources.mem_blobs overlay, checked before the on-disk blob read in the apply pipeline's blob strategy. 
- core: harvest_artifact_blobs — re-stage afterHash blobs from the committed vendor artifact itself (uuid-matched against the ledger, every blob self-verified by its own git-sha256), so in-sync re-runs and fresh clones of vendored projects stage with no network. - cli: stage_vendor_sources_in_memory replaces the disk stager in all vendor flows; missing content is fetched per patch via the proxy-aware patch-view endpoint straight into memory. - cli: DownloadParams.persist_blobs — scan passes !args.vendor so the scan --vendor download phase writes only the manifest. - e2e: .socket-stays-lean assertions (manifest mode, detached, fresh clone) + no-blobs detached idempotency; core harvest unit tests (tgz, dir-shaped, stale-uuid, escaping-path fail-closed). - docs: CLI contract "Patch sources stay in memory" section. Co-Authored-By: Claude Fable 5 --- crates/socket-patch-cli/CLI_CONTRACT.md | 10 +- crates/socket-patch-cli/src/commands/apply.rs | 68 ++--- .../src/commands/fetch_stage.rs | 191 ++++++++++++ crates/socket-patch-cli/src/commands/get.rs | 61 ++-- .../socket-patch-cli/src/commands/repair.rs | 1 + crates/socket-patch-cli/src/commands/scan.rs | 153 +++++++--- .../socket-patch-cli/src/commands/vendor.rs | 94 +++--- .../socket-patch-cli/tests/apply_network.rs | 17 +- .../tests/in_process_get_update_count.rs | 1 + .../tests/in_process_vendor.rs | 14 +- .../socket-patch-cli/tests/scan_vendor_e2e.rs | 78 ++++- .../src/crawlers/deno_crawler.rs | 5 +- crates/socket-patch-core/src/patch/apply.rs | 44 ++- .../src/patch/go_redirect.rs | 3 +- .../src/patch/vendor/composer_lock.rs | 4 +- .../socket-patch-core/src/patch/vendor/gem.rs | 4 +- .../src/patch/vendor/golang.rs | 2 +- .../src/patch/vendor/lock_inventory.rs | 111 ++++--- .../socket-patch-core/src/patch/vendor/mod.rs | 287 +++++++++++++++++- .../src/patch/vendor/npm_common.rs | 10 +- .../src/patch/vendor/npm_lock.rs | 10 +- .../src/patch/vendor/pnpm_lock.rs | 4 +- .../src/patch/vendor/registry_fetch.rs | 58 ++-- 
crates/socket-patch-core/src/utils/purl.rs | 8 +- .../tests/blob_fetcher_edges_e2e.rs | 6 + 25 files changed, 996 insertions(+), 248 deletions(-) diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md index 14bc18e..88229b3 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -82,7 +82,7 @@ Beyond the globals above, each subcommand defines a small set of local arguments `scan --sync` is sugar for `--apply --prune` — the canonical single-flag bot invocation. `scan --json --sync --yes` discovers, applies, and reconciles state in one pass. -`scan --vendor` swaps the in-place apply for the vendor pipeline: discover → download (manifest written, as `--apply`) → vendor every patched dependency via the same engine as the `vendor` command (under the same lock). The whole manifest is vendored, so a package vendored at an older patch uuid is **re-vendored automatically** (its old uuid dir is removed — `vendor_stale_artifact_removed`); same-uuid re-runs are `already_vendored` skips. With `--prune`, GC runs **before** the vendor step so stale manifest entries don't fail vendoring with `package_not_installed`. JSON output gains a `download` sub-object (the download phase; no `applied` field — nothing is applied in place) and a `vendor` sub-object (a full vendor Envelope). `--dry-run` previews per-patch `would_vendor` | `would_revendor` (+`oldUuid`) | `already_vendored` without network downloads or disk writes. Interactive mode prompts "Download and vendor N patch(es)?". +`scan --vendor` swaps the in-place apply for the vendor pipeline: discover → download (manifest written, as `--apply`) → vendor every patched dependency via the same engine as the `vendor` command (under the same lock). 
The whole manifest is vendored, so a package vendored at an older patch uuid is **re-vendored automatically** (its old uuid dir is removed — `vendor_stale_artifact_removed`); same-uuid re-runs are `already_vendored` skips. With `--prune`, GC runs **before** the vendor step so stale manifest entries don't fail vendoring with `package_not_installed`. JSON output gains a `download` sub-object (the download phase; no `applied` field — nothing is applied in place) and a `vendor` sub-object (a full vendor Envelope). The download phase writes only `.socket/manifest.json`; patch blobs are held in memory (see "Patch sources stay in memory" under the vendor contract). `--dry-run` previews per-patch `would_vendor` | `would_revendor` (+`oldUuid`) | `already_vendored` without network downloads or disk writes. Interactive mode prompts "Download and vendor N patch(es)?". `scan --vendor --detached` performs the same vendoring **without ever writing `.socket/manifest.json`**: records are fetched into memory (`download.detached: true`), the artifacts are built + wired, and the ledger entry carries `detached: true` plus an embedded copy of the patch record (`record`) as the verification source. Detached patches are invisible to apply/rollback/repair (nothing is in the manifest), exempt from `vendor`'s manifest reconcile, and exit via `remove ` (which reverts them) or `vendor --revert`. Idempotent re-runs reuse the embedded record and skip the patch-view fetch entirely. @@ -326,6 +326,14 @@ machines with **no socket-patch installed and no Socket API access** (registry a unvendored dependencies may still be needed). Every mechanism below was validated against the real package managers (`spikes/PHASE0-FINDINGS.txt`). +**Patch sources stay in memory (v3.4)**: vendoring never writes `.socket/blobs/`, `.socket/diffs/`, +or temporary patch files. 
Pre-existing `.socket/` artifacts (from a prior `apply`/`get`/`repair`) +are read in place; already-vendored purls re-stage patch content from the committed artifact itself +(uuid-matched against the ledger, every harvested blob self-verified by its afterHash — so in-sync +re-runs and fresh clones of vendored projects need no network); anything still missing is fetched +into memory via the patch-view endpoint. A vendored project's `.socket/` holds only +`manifest.json` (omitted in detached mode) and `vendor/`. + ### Path convention + patch-UUID recovery (stable) ```text diff --git a/crates/socket-patch-cli/src/commands/apply.rs b/crates/socket-patch-cli/src/commands/apply.rs index e2cd0db..f839b7f 100644 --- a/crates/socket-patch-cli/src/commands/apply.rs +++ b/crates/socket-patch-cli/src/commands/apply.rs @@ -5,8 +5,8 @@ use socket_patch_core::crawlers::{ }; use socket_patch_core::manifest::operations::read_manifest; use socket_patch_core::manifest::schema::PatchRecord; -use socket_patch_core::patch::apply::{MismatchPolicy, - apply_package_patch, verify_file_patch, ApplyResult, PatchSources, VerifyStatus, +use socket_patch_core::patch::apply::{ + apply_package_patch, verify_file_patch, ApplyResult, MismatchPolicy, PatchSources, VerifyStatus, }; /// Files whose pre-apply content matched NEITHER hash and were (or would /// be) overwritten with the verified patched content — the promoted @@ -94,10 +94,7 @@ async fn ensure_blobs_for_mismatches( } let (client, _) = get_api_client_with_overrides(args.common.api_client_overrides()).await; let _ = socket_patch_core::api::blob_fetcher::fetch_blobs_by_hash( - &needed, - blobs_path, - &client, - None, + &needed, blobs_path, &client, None, ) .await; } @@ -732,17 +729,13 @@ pub async fn run(args: ApplyArgs) -> i32 { }; println!( " {}{}", - socket_patch_core::utils::purl::normalize_purl( - &result.package_key - ), + socket_patch_core::utils::purl::normalize_purl(&result.package_key), suffix ); } else if 
all_files_already_patched(result) { println!( " {} (already patched)", - socket_patch_core::utils::purl::normalize_purl( - &result.package_key - ) + socket_patch_core::utils::purl::normalize_purl(&result.package_key) ); } } @@ -1102,6 +1095,7 @@ async fn apply_patches_inner( blobs_path: &blobs_path, packages_path: Some(&packages_path), diffs_path: Some(&diffs_path), + mem_blobs: None, }; let result = apply_package_patch( variant_purl, @@ -1186,6 +1180,7 @@ async fn apply_patches_inner( blobs_path: &blobs_path, packages_path: Some(&packages_path), diffs_path: Some(&diffs_path), + mem_blobs: None, }; // Local go redirects to a project-local patched copy under // `.socket/go-patches/` wired via a `go.mod` `replace` (the module @@ -1193,31 +1188,30 @@ async fn apply_patches_inner( // Everything else — npm/pypi/gem and cargo (vendored or registry // cache) — patches in place via `apply_package_patch`. Without the // `golang` feature `try_local_go_apply` is an inert `None`. - let result = - match try_local_go_apply( - purl, - pkg_path, - patch, - &sources, - &args.common, - mismatch_policy(args.force, args.common.strict), - ) + let result = match try_local_go_apply( + purl, + pkg_path, + patch, + &sources, + &args.common, + mismatch_policy(args.force, args.common.strict), + ) + .await + { + Some(r) => r, + None => { + apply_package_patch( + purl, + pkg_path, + &patch.files, + &sources, + Some(&patch.uuid), + args.common.dry_run, + mismatch_policy(args.force, args.common.strict), + ) .await - { - Some(r) => r, - None => { - apply_package_patch( - purl, - pkg_path, - &patch.files, - &sources, - Some(&patch.uuid), - args.common.dry_run, - mismatch_policy(args.force, args.common.strict), - ) - .await - } - }; + } + }; warn_mismatch_overwrites(&result, &args.common); if !result.success { @@ -1434,7 +1428,7 @@ mod tests { .enumerate() .map(|(i, status)| VerifyResult { file: format!("package/f{i}.js"), - status: status.clone(), + status: *status, message: None, current_hash: 
None, expected_hash: None, diff --git a/crates/socket-patch-cli/src/commands/fetch_stage.rs b/crates/socket-patch-cli/src/commands/fetch_stage.rs index b1ffbb9..8976676 100644 --- a/crates/socket-patch-cli/src/commands/fetch_stage.rs +++ b/crates/socket-patch-cli/src/commands/fetch_stage.rs @@ -7,6 +7,7 @@ //! cache is `repair`'s job, keeping these commands read-only against //! `.socket/`). +use std::collections::HashMap; use std::path::{Path, PathBuf}; use socket_patch_core::api::blob_fetcher::{ @@ -36,6 +37,7 @@ impl StagedSources { blobs_path: &self.blobs, packages_path: Some(&self.packages), diffs_path: Some(&self.diffs), + mem_blobs: None, } } } @@ -199,6 +201,7 @@ pub async fn stage_patch_sources( blobs_path: &stage_blobs, packages_path: Some(&stage_packages), diffs_path: Some(&stage_diffs), + mem_blobs: None, }; let fetch_result = fetch_missing_sources(manifest, &sources, download_mode, &client, None).await; @@ -244,3 +247,191 @@ pub async fn stage_patch_sources( _stage: Some(stage), })) } + +/// In-memory staged sources for the VENDOR flows. +/// +/// Existing `.socket/` artifacts are read in place (never copied, never +/// rewritten); patch content that is missing locally is fetched into +/// MEMORY via the patch view endpoint — vendoring writes no +/// `.socket/blobs` entries and no temporary files. The committed +/// `.socket/vendor/` artifact is the patch; nothing else should land on +/// disk. +pub struct MemStagedSources { + blobs: PathBuf, + diffs: PathBuf, + packages: PathBuf, + mem: HashMap>, +} + +impl MemStagedSources { + /// Borrow as the core pipeline's source set (memory overlay first, + /// on-disk artifacts as the read-only fallback). + pub fn as_patch_sources(&self) -> PatchSources<'_> { + PatchSources { + blobs_path: &self.blobs, + packages_path: Some(&self.packages), + diffs_path: Some(&self.diffs), + mem_blobs: Some(&self.mem), + } + } +} + +/// The in-memory staging outcome (mirror of [`StageOutcome`]). 
+pub enum MemStageOutcome { + Ready(MemStagedSources), + Unavailable, +} + +/// Stage patch sources for a VENDOR run without writing anything: +/// per-record availability follows the same rule as +/// [`stage_patch_sources`] (all after-blobs on disk, or a diff/package +/// archive on disk), and records with no usable local source have their +/// full per-file content fetched into memory from the patch view +/// endpoint (`blobContent`). Offline runs with missing sources are +/// `Unavailable` with the same diagnostics as the disk stager. +pub async fn stage_vendor_sources_in_memory( + common: &GlobalArgs, + manifest: &PatchManifest, + socket_dir: &Path, + project_root: &Path, +) -> Result<MemStageOutcome, String> { + let blobs = socket_dir.join("blobs"); + let diffs = socket_dir.join("diffs"); + let packages = socket_dir.join("packages"); + + let missing_blobs = get_missing_blobs(manifest, &blobs).await; + let missing_diff_archives = get_missing_archives(manifest, &diffs).await; + let missing_package_archives = get_missing_archives(manifest, &packages).await; + + let mut to_fetch: Vec<(&str, &str)> = manifest + .patches + .iter() + .filter_map(|(purl, record)| { + let all_blobs_present = record + .files + .values() + .all(|f| !missing_blobs.contains(&f.after_hash)); + let diff_present = !missing_diff_archives.contains(&record.uuid); + let pkg_present = !missing_package_archives.contains(&record.uuid); + if all_blobs_present || diff_present || pkg_present { + None + } else { + Some((purl.as_str(), record.uuid.as_str())) + } + }) + .collect(); + + if to_fetch.is_empty() { + return Ok(MemStageOutcome::Ready(MemStagedSources { + blobs, + diffs, + packages, + mem: HashMap::new(), + })); + } + + // The committed vendor artifact IS the patched content: harvest its + // afterHash blobs into memory so in-sync re-runs and fresh clones of + // already-vendored projects stage with no network and no disk blobs. 
+ let mut mem = + socket_patch_core::patch::vendor::harvest_artifact_blobs(project_root, &manifest.patches) + .await; + if !mem.is_empty() { + to_fetch.retain(|(purl, _)| { + manifest.patches.get(*purl).is_none_or(|record| { + !record.files.values().all(|f| { + !missing_blobs.contains(&f.after_hash) || mem.contains_key(&f.after_hash) + }) + }) + }); + if to_fetch.is_empty() { + return Ok(MemStageOutcome::Ready(MemStagedSources { + blobs, + diffs, + packages, + mem, + })); + } + } + + if common.offline { + if !common.silent && !common.json { + eprintln!( + "Error: {} patch(es) have no local source and --offline is set:", + to_fetch.len() + ); + for (purl, _) in to_fetch.iter().take(5) { + eprintln!(" - {}", purl); + } + if to_fetch.len() > 5 { + eprintln!(" ... and {} more", to_fetch.len() - 5); + } + eprintln!("Run \"socket-patch repair\" to download missing artifacts."); + } + return Ok(MemStageOutcome::Unavailable); + } + + if !common.silent && !common.json { + println!( + "Fetching {} patch(es)' content (kept in memory)...", + to_fetch.len() + ); + } + + let (client, _) = get_api_client_with_overrides(common.api_client_overrides()).await; + let mut failed: Vec<&str> = Vec::new(); + for (purl, uuid) in &to_fetch { + match client.fetch_patch(common.org.as_deref(), uuid).await { + Ok(Some(patch)) => { + let mut complete = true; + for (file, info) in &patch.files { + let (Some(b64), Some(hash)) = (&info.blob_content, &info.after_hash) else { + if !common.silent && !common.json { + eprintln!(" [error] {purl}: no blob content served for {file}"); + } + complete = false; + break; + }; + // Same key guard as the disk writer: the hash names the + // lookup key the apply pipeline gates writes on. 
+ if hash.len() != 64 || !hash.bytes().all(|b| b.is_ascii_hexdigit()) { + complete = false; + break; + } + match super::get::base64_decode(b64) { + Ok(bytes) => { + mem.insert(hash.clone(), bytes); + } + Err(_) => { + complete = false; + break; + } + } + } + if !complete { + failed.push(purl); + } + } + _ => failed.push(purl), + } + } + if !failed.is_empty() { + if !common.silent && !common.json { + eprintln!( + "Error: could not fetch patch content for {} patch(es):", + failed.len() + ); + for purl in failed.iter().take(5) { + eprintln!(" - {}", purl); + } + } + return Ok(MemStageOutcome::Unavailable); + } + + Ok(MemStageOutcome::Ready(MemStagedSources { + blobs, + diffs, + packages, + mem, + })) +} diff --git a/crates/socket-patch-cli/src/commands/get.rs b/crates/socket-patch-cli/src/commands/get.rs index b6f1271..f9ec978 100644 --- a/crates/socket-patch-cli/src/commands/get.rs +++ b/crates/socket-patch-cli/src/commands/get.rs @@ -583,6 +583,11 @@ pub struct DownloadParams { /// `--strict` forwarded to the nested apply (a beforeHash mismatch /// fails instead of warn-and-overwrite). pub strict: bool, + /// Persist downloaded blob content into `.socket/blobs` (the apply + /// flows need it for later hook/rollback runs). Vendor flows pass + /// `false`: their patch content is staged in memory and the committed + /// artifact is the patch — nothing should land in `.socket/blobs`. 
+ pub persist_blobs: bool, } /// Narrow a selection of patches down to the release variant(s) present @@ -768,14 +773,16 @@ pub(crate) async fn download_patch_records( let socket_dir = params.cwd.join(".socket"); let blobs_dir = socket_dir.join("blobs"); - if let Err(e) = tokio::fs::create_dir_all(&blobs_dir).await { - let err = format!("Failed to create blobs directory: {}", e); - report_error(params.json, &err); - return ( - 1, - serde_json::json!({"status": "error", "error": err}), - HashMap::new(), - ); + if params.persist_blobs { + if let Err(e) = tokio::fs::create_dir_all(&blobs_dir).await { + let err = format!("Failed to create blobs directory: {}", e); + report_error(params.json, &err); + return ( + 1, + serde_json::json!({"status": "error", "error": err}), + HashMap::new(), + ); + } } let mut narrow_warnings: Vec = Vec::new(); @@ -854,9 +861,13 @@ pub(crate) async fn download_patch_records( } } let quiet = params.json || params.silent; - if write_all_patch_blobs(&blobs_dir, &patch, quiet) - .await - .is_err() + // Vendor flows keep blob content in memory (the vendor + // step re-fetches what it needs); persisting blobs here + // would litter .socket/blobs for no consumer. 
+ if params.persist_blobs + && write_all_patch_blobs(&blobs_dir, &patch, quiet) + .await + .is_err() { failed += 1; patch_records_json.push(serde_json::json!({ @@ -980,10 +991,12 @@ pub async fn download_and_apply_patches( report_error(params.json, &err); return (1, serde_json::json!({"status": "error", "error": err})); } - if let Err(e) = tokio::fs::create_dir_all(&blobs_dir).await { - let err = format!("Failed to create blobs directory: {}", e); - report_error(params.json, &err); - return (1, serde_json::json!({"status": "error", "error": err})); + if params.persist_blobs { + if let Err(e) = tokio::fs::create_dir_all(&blobs_dir).await { + let err = format!("Failed to create blobs directory: {}", e); + report_error(params.json, &err); + return (1, serde_json::json!({"status": "error", "error": err})); + } } let mut manifest = match read_manifest(&manifest_path).await { @@ -1033,7 +1046,10 @@ pub async fn download_and_apply_patches( let action = decide_patch_action(&manifest, &patch.purl, &patch.uuid); if let PatchAction::Skipped = action { if !params.json && !params.silent { - eprintln!(" [skip] {} (already in manifest)", normalize_purl(&patch.purl)); + eprintln!( + " [skip] {} (already in manifest)", + normalize_purl(&patch.purl) + ); } downloaded_patches.push(serde_json::json!({ "purl": patch.purl, @@ -1063,9 +1079,13 @@ pub async fn download_and_apply_patches( } let quiet = params.json || params.silent; - if write_all_patch_blobs(&blobs_dir, &patch, quiet) - .await - .is_err() + // Vendor flows keep blob content in memory (the vendor + // step re-fetches what it needs); persisting blobs here + // would litter .socket/blobs for no consumer. 
+ if params.persist_blobs + && write_all_patch_blobs(&blobs_dir, &patch, quiet) + .await + .is_err() { patches_failed += 1; downloaded_patches.push(serde_json::json!({ @@ -1626,6 +1646,7 @@ pub async fn run(args: GetArgs) -> i32 { api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, strict: args.common.strict, + persist_blobs: true, }; let (code, result_json) = download_and_apply_patches(&selected, &params).await; @@ -1869,7 +1890,7 @@ async fn save_and_apply_patch(args: &GetArgs, patch: &PatchResponse) -> i32 { exit_code } -fn base64_decode(input: &str) -> Result<Vec<u8>, String> { +pub(crate) fn base64_decode(input: &str) -> Result<Vec<u8>, String> { let chars = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; let mut table = [255u8; 256]; for (i, &c) in chars.iter().enumerate() { diff --git a/crates/socket-patch-cli/src/commands/repair.rs b/crates/socket-patch-cli/src/commands/repair.rs index f83b9fb..58754b5 100644 --- a/crates/socket-patch-cli/src/commands/repair.rs +++ b/crates/socket-patch-cli/src/commands/repair.rs @@ -241,6 +241,7 @@ pub(crate) async fn repair_inner( blobs_path: &blobs_path, packages_path: Some(&packages_path), diffs_path: Some(&diffs_path), + mem_blobs: None, }; let fetch_result = fetch_missing_sources(&manifest, &sources, download_mode, &client, None).await; diff --git a/crates/socket-patch-cli/src/commands/scan.rs b/crates/socket-patch-cli/src/commands/scan.rs index 3f2b989..7d75263 100644 --- a/crates/socket-patch-cli/src/commands/scan.rs +++ b/crates/socket-patch-cli/src/commands/scan.rs @@ -19,7 +19,7 @@ use std::path::Path; use std::time::Duration; use crate::args::{apply_env_toggles, GlobalArgs}; -use crate::commands::fetch_stage::{stage_patch_sources, StageOutcome}; +use crate::commands::fetch_stage::{stage_vendor_sources_in_memory, MemStageOutcome}; use crate::commands::vex::{generate_vex_from_manifest_path, VexEmbedArgs}; use crate::ecosystem_dispatch::crawl_all_ecosystems; use 
crate::json_envelope::{Command as EnvelopeCommand, Envelope}; @@ -433,8 +433,7 @@ async fn preverify_vendor_baselines( if info.before_hash.is_empty() { continue; // a new file has no baseline to compare } - if verify_file_patch(&pkg.path, file, &info).await.status - == VerifyStatus::HashMismatch + if verify_file_patch(&pkg.path, file, &info).await.status == VerifyStatus::HashMismatch { mismatched.insert(patch.uuid.clone()); break; @@ -734,16 +733,18 @@ async fn run_scan_vendor_step( patches: records.clone(), setup: None, }; - let staged = match stage_patch_sources(common, &synth, socket_dir).await { - Ok(StageOutcome::Ready(s)) => s, - Ok(StageOutcome::Unavailable) => { - return Err(( - "no_local_source", - "patch artifacts unavailable (offline or download failure)".to_string(), - )) - } - Err(e) => return Err(("stage_failed", e)), - }; + let staged = + match stage_vendor_sources_in_memory(common, &synth, socket_dir, &common.cwd).await + { + Ok(MemStageOutcome::Ready(s)) => s, + Ok(MemStageOutcome::Unavailable) => { + return Err(( + "no_local_source", + "patch artifacts unavailable (offline or download failure)".to_string(), + )) + } + Err(e) => return Err(("stage_failed", e)), + }; let sources = staged.as_patch_sources(); boxed_vendor_records(common, records, &sources, true, &mut env).await } @@ -761,16 +762,19 @@ async fn run_scan_vendor_step( // Same placement as the `vendor` command: dropped entries // are reverted even when zero in-scope patches remain. 
let mut has_errors = reconcile_dropped(&manifest, common, &mut env).await; - let staged = match stage_patch_sources(common, &manifest, socket_dir).await { - Ok(StageOutcome::Ready(s)) => s, - Ok(StageOutcome::Unavailable) => { - return Err(( - "no_local_source", - "patch artifacts unavailable (offline or download failure)".to_string(), - )) - } - Err(e) => return Err(("stage_failed", e)), - }; + let staged = + match stage_vendor_sources_in_memory(common, &manifest, socket_dir, &common.cwd) + .await + { + Ok(MemStageOutcome::Ready(s)) => s, + Ok(MemStageOutcome::Unavailable) => { + return Err(( + "no_local_source", + "patch artifacts unavailable (offline or download failure)".to_string(), + )) + } + Err(e) => return Err(("stage_failed", e)), + }; let sources = staged.as_patch_sources(); has_errors |= boxed_vendor_records(common, &manifest.patches, &sources, false, &mut env).await; @@ -837,8 +841,14 @@ async fn run_vendor_json_path( // and preview the GC, exactly like `--apply`'s dry run. result["vendor"] = preview_vendor_json(&args.common.cwd, &selected).await; if prune { - let gc = - preview_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = preview_apply_gc( + &args.common, + manifest_path, + socket_dir, + scanned_purls, + vendored_purls, + ) + .await; result["gc"] = gc.to_preview_json(); } let final_code = @@ -865,6 +875,7 @@ async fn run_vendor_json_path( api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, strict: args.common.strict, + persist_blobs: !args.vendor, }; let mut has_errors = false; let detached_records: Option> = if args.detached { @@ -900,7 +911,14 @@ async fn run_vendor_json_path( // package_not_installed; vendored entries are exempt from // the prune itself. 
if prune { - let gc = run_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = run_apply_gc( + &args.common, + manifest_path, + socket_dir, + scanned_purls, + vendored_purls, + ) + .await; result["gc"] = gc.to_apply_json(); } @@ -986,7 +1004,14 @@ async fn run_vendor_interactive_path( // GC before the vendor step (see the JSON path): stale manifest // entries would fail vendoring with package_not_installed. if prune { - let gc = run_apply_gc(&args.common, manifest_path, socket_dir, scanned_purls, vendored_purls).await; + let gc = run_apply_gc( + &args.common, + manifest_path, + socket_dir, + scanned_purls, + vendored_purls, + ) + .await; if !gc.pruned.is_empty() { println!("GC: pruned {} manifest entr{}.", gc.pruned.len(), { if gc.pruned.len() == 1 { @@ -1000,7 +1025,11 @@ async fn run_vendor_interactive_path( println!( "GC: reverted {} vendored entr{}; swept {} orphan vendor dir{}.", gc.vendored_reverted.len(), - if gc.vendored_reverted.len() == 1 { "y" } else { "ies" }, + if gc.vendored_reverted.len() == 1 { + "y" + } else { + "ies" + }, gc.vendor_orphan_dirs, if gc.vendor_orphan_dirs == 1 { "" } else { "s" }, ); @@ -1079,9 +1108,8 @@ fn partition_not_installed_selected( if lockfile_only.is_empty() { return (selected, Vec::new()); } - let is_lockfile_only = |p: &str| { - lockfile_only.contains(normalize_purl(strip_purl_qualifiers(p)).as_ref()) - }; + let is_lockfile_only = + |p: &str| lockfile_only.contains(normalize_purl(strip_purl_qualifiers(p)).as_ref()); let (not_installed, kept): (Vec<_>, Vec<_>) = selected .into_iter() .partition(|p| is_lockfile_only(&p.purl)); @@ -1823,6 +1851,7 @@ pub async fn run(args: ScanArgs) -> i32 { api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, strict: args.common.strict, + persist_blobs: !args.vendor, }; let (code, apply_json) = download_and_apply_patches(&selected, ¶ms).await; apply_code = code; @@ -1837,10 +1866,23 @@ pub async fn run(args: 
ScanArgs) -> i32 { // --- GC (if requested) -------------------------------------- if prune { let gc = if dry { - preview_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls) - .await + preview_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await } else { - run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + run_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await }; result["gc"] = if dry { gc.to_preview_json() @@ -1890,9 +1932,23 @@ pub async fn run(args: ScanArgs) -> i32 { // --- GC-only path (no --apply, just --prune) -------------------- if prune { let gc = if dry { - preview_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + preview_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await } else { - run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await + run_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await }; result["gc"] = if dry { gc.to_preview_json() @@ -2307,6 +2363,7 @@ pub async fn run(args: ScanArgs) -> i32 { api_overrides: args.common.api_client_overrides(), all_releases: args.all_releases, strict: args.common.strict, + persist_blobs: !args.vendor, }; let code = if args.vendor { @@ -2336,7 +2393,14 @@ pub async fn run(args: ScanArgs) -> i32 { // run `socket-patch gc` (or `repair`) explicitly. (Vendor mode // already ran its GC before the vendor step.) 
if prune && !args.vendor { - let gc = run_apply_gc(&args.common, &manifest_path, &socket_dir, &scanned_purls, &vendored_purls).await; + let gc = run_apply_gc( + &args.common, + &manifest_path, + &socket_dir, + &scanned_purls, + &vendored_purls, + ) + .await; let total = gc.blobs.blobs_removed + gc.diffs.blobs_removed + gc.packages.blobs_removed; if !args.common.silent && (!gc.pruned.is_empty() || total > 0) { println!( @@ -2352,7 +2416,11 @@ pub async fn run(args: ScanArgs) -> i32 { println!( "GC: reverted {} vendored entr{}; swept {} orphan vendor dir{}.", gc.vendored_reverted.len(), - if gc.vendored_reverted.len() == 1 { "y" } else { "ies" }, + if gc.vendored_reverted.len() == 1 { + "y" + } else { + "ies" + }, gc.vendor_orphan_dirs, if gc.vendor_orphan_dirs == 1 { "" } else { "s" }, ); @@ -2794,7 +2862,14 @@ mod tests { let tmp_wet = tempfile::tempdir().unwrap(); let (mp_w, sd_w, blob_w) = seed_manifest_with_blob(tmp_wet.path(), "pkg:npm/gone@1.0.0", &after_hash); - let wet = run_apply_gc(&gc_common(tmp_wet.path()), &mp_w, &sd_w, &scanned, &no_vendored()).await; + let wet = run_apply_gc( + &gc_common(tmp_wet.path()), + &mp_w, + &sd_w, + &scanned, + &no_vendored(), + ) + .await; assert_eq!( preview.blobs.blobs_removed, wet.blobs.blobs_removed, diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index 0ca3037..da33f3a 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -36,7 +36,7 @@ use std::time::Duration; use crate::args::{apply_env_toggles, GlobalArgs}; use crate::commands::apply::{result_to_event, variant_matches_installed}; -use crate::commands::fetch_stage::{stage_patch_sources, StageOutcome}; +use crate::commands::fetch_stage::{stage_vendor_sources_in_memory, MemStageOutcome}; use crate::commands::lock_cli::{acquire_or_emit, lock_broken_event}; use crate::commands::vex::{generate_vex_from_manifest_path, VexEmbedArgs}; use 
crate::ecosystem_dispatch::{find_packages_for_purls, partition_purls}; @@ -246,11 +246,8 @@ pub(crate) async fn dispatch_revert_one( pub(crate) async fn dispatch_in_use_one(entry: &VendorEntry, project_root: &Path) -> Option { match entry.ecosystem.as_str() { "npm" => { - socket_patch_core::patch::vendor::npm_flavor::vendored_entry_in_use( - entry, - project_root, - ) - .await + socket_patch_core::patch::vendor::npm_flavor::vendored_entry_in_use(entry, project_root) + .await } _ => None, } @@ -433,20 +430,24 @@ async fn run_vendor(args: &VendorArgs, manifest_path: &Path, env: &mut Envelope) let mut has_errors = reconcile_dropped(&manifest, common, env).await; let socket_dir = manifest_path.parent().unwrap_or(Path::new(".")); - let staged = match stage_patch_sources(common, &manifest, socket_dir).await { - Ok(StageOutcome::Ready(s)) => s, - Ok(StageOutcome::Unavailable) => { - env.mark_error(EnvelopeError::new( - "no_local_source", - "patch artifacts unavailable (offline or download failure)", - )); - return 1; - } - Err(e) => { - env.mark_error(EnvelopeError::new("stage_failed", e)); - return 1; - } - }; + // Vendor stages patch content IN MEMORY: existing .socket artifacts are + // read in place, missing content is fetched per patch — vendoring never + // writes blobs or temp files (the committed artifact is the patch). 
+ let staged = + match stage_vendor_sources_in_memory(common, &manifest, socket_dir, &common.cwd).await { + Ok(MemStageOutcome::Ready(s)) => s, + Ok(MemStageOutcome::Unavailable) => { + env.mark_error(EnvelopeError::new( + "no_local_source", + "patch artifacts unavailable (offline or download failure)", + )); + return 1; + } + Err(e) => { + env.mark_error(EnvelopeError::new("stage_failed", e)); + return 1; + } + }; let sources = staged.as_patch_sources(); has_errors |= vendor_records(common, &manifest.patches, &sources, false, args.force, env).await; @@ -570,9 +571,7 @@ pub(crate) async fn vendor_records( .filter(|e| e.ecosystem == "npm" && e.artifact.path.ends_with(".tgz")) { let tgz = common.cwd.join(&entry.artifact.path); - match registry_fetch::stage_local_artifact(&tgz, &entry.artifact.sha256) - .await - { + match registry_fetch::stage_local_artifact(&tgz, &entry.artifact.sha256).await { Ok(staged) => { all_packages.insert(purl.clone(), staged.dir().to_path_buf()); fetched_holders.push(staged); @@ -588,10 +587,7 @@ pub(crate) async fn vendor_records( .with_error("vendor_fetch_failed", detail.clone()), ); if !common.silent && !common.json { - eprintln!( - "Cannot vendor {}: {detail}", - normalize_purl(purl) - ); + eprintln!("Cannot vendor {}: {detail}", normalize_purl(purl)); } continue; } @@ -912,8 +908,7 @@ pub(crate) async fn vendor_records( unmatched .iter() .filter(|p| { - socket_patch_core::patch::vendor::lock_inventory::lookup(&entries, p) - .is_some() + socket_patch_core::patch::vendor::lock_inventory::lookup(&entries, p).is_some() }) .cloned() .collect() @@ -1203,7 +1198,10 @@ pub(crate) async fn run_vendor_gc( continue; } let entry = state.entries.get(&purl).cloned().expect("listed above"); - if dispatch_revert_one(&entry, &common.cwd, false).await.success { + if dispatch_revert_one(&entry, &common.cwd, false) + .await + .success + { state.entries.remove(&purl); out.dropped_reverted.push(purl); } else { @@ -1229,7 +1227,10 @@ pub(crate) async fn 
run_vendor_gc( out.unused_reverted.push(purl); continue; } - if !dispatch_revert_one(&entry, &common.cwd, false).await.success { + if !dispatch_revert_one(&entry, &common.cwd, false) + .await + .success + { out.failed.push(purl); continue; } @@ -1365,7 +1366,11 @@ mod gc_tests { assert!(out.dropped_reverted.is_empty(), "{out:?}"); assert!(out.unused_reverted.is_empty(), "{out:?}"); assert_eq!(out.orphan_dirs, 0); - assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + assert!(load_state(tmp.path()) + .await + .unwrap() + .entries + .contains_key(PURL)); } /// (a) the patch is gone from the manifest: revert + drop the entry. @@ -1381,7 +1386,9 @@ mod gc_tests { assert!(out.failed.is_empty(), "{out:?}"); assert!(load_state(tmp.path()).await.unwrap().entries.is_empty()); assert!( - !tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + !tmp.path() + .join(format!(".socket/vendor/npm/{UUID}")) + .exists(), "artifact dir removed by the revert" ); } @@ -1433,7 +1440,9 @@ mod gc_tests { "dry run must not touch the manifest" ); assert!( - tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + tmp.path() + .join(format!(".socket/vendor/npm/{UUID}")) + .exists(), "dry run must not remove artifacts" ); } @@ -1449,7 +1458,11 @@ mod gc_tests { .unwrap(); let out = run_vendor_gc(&common, &manifest_path, false).await; assert!(out.unused_reverted.is_empty(), "{out:?}"); - assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + assert!(load_state(tmp.path()) + .await + .unwrap() + .entries + .contains_key(PURL)); // Detached entry: absent from the manifest AND lockfile-invisible — // exactly its normal state. Never reverted by the GC. 
@@ -1463,7 +1476,11 @@ mod gc_tests { let out = run_vendor_gc(&common, &manifest_path, false).await; assert!(out.dropped_reverted.is_empty(), "{out:?}"); assert!(out.unused_reverted.is_empty(), "{out:?}"); - assert!(load_state(tmp.path()).await.unwrap().entries.contains_key(PURL)); + assert!(load_state(tmp.path()) + .await + .unwrap() + .entries + .contains_key(PURL)); } /// (c) uuid dirs with no owning ledger entry are swept (wet) / counted @@ -1486,6 +1503,9 @@ mod gc_tests { assert_eq!(out.orphan_dirs, 1, "{out:?}"); assert!(!orphan_dir.exists(), "wet run sweeps the orphan"); // The recorded entry's dir survives the sweep. - assert!(tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists()); + assert!(tmp + .path() + .join(format!(".socket/vendor/npm/{UUID}")) + .exists()); } } diff --git a/crates/socket-patch-cli/tests/apply_network.rs b/crates/socket-patch-cli/tests/apply_network.rs index 08f1fd8..5c4eadb 100644 --- a/crates/socket-patch-cli/tests/apply_network.rs +++ b/crates/socket-patch-cli/tests/apply_network.rs @@ -504,7 +504,10 @@ async fn apply_hash_mismatch_default_warns_and_applies_strict_fails() { "the overwrite is surfaced as a warning event: {events:?}" ); let content = std::fs::read(tmp.path().join("node_modules/mismatch/index.js")).unwrap(); - assert_eq!(content, after, "the file carries the verified patched bytes"); + assert_eq!( + content, after, + "the file carries the verified patched bytes" + ); // The human run logs the warning to stderr. 
let tmp = fixture(); @@ -757,8 +760,16 @@ async fn apply_mismatch_redownloads_full_blob_and_applies() { header.set_size(bytes.len() as u64); header.set_mode(0o644); header.set_cksum(); - builder.append_data(&mut header, "other.js", &bytes[..]).unwrap(); - builder.into_inner().unwrap().finish().unwrap().flush().unwrap(); + builder + .append_data(&mut header, "other.js", &bytes[..]) + .unwrap(); + builder + .into_inner() + .unwrap() + .finish() + .unwrap() + .flush() + .unwrap(); } let (code, stdout, stderr) = run_apply(tmp.path(), &mock.uri(), &[]); diff --git a/crates/socket-patch-cli/tests/in_process_get_update_count.rs b/crates/socket-patch-cli/tests/in_process_get_update_count.rs index 36e4bef..905a5d0 100644 --- a/crates/socket-patch-cli/tests/in_process_get_update_count.rs +++ b/crates/socket-patch-cli/tests/in_process_get_update_count.rs @@ -73,6 +73,7 @@ fn params(root: &Path, server: &MockServer) -> DownloadParams { proxy_url: None, }, strict: false, + persist_blobs: true, // Skip release-narrowing; npm has no variants anyway. all_releases: true, } diff --git a/crates/socket-patch-cli/tests/in_process_vendor.rs b/crates/socket-patch-cli/tests/in_process_vendor.rs index 791d9f9..536ea67 100644 --- a/crates/socket-patch-cli/tests/in_process_vendor.rs +++ b/crates/socket-patch-cli/tests/in_process_vendor.rs @@ -1292,7 +1292,10 @@ fn mismatched_baseline_vendors_with_warning_event() { .contains("left-pad@1.3.0"), "warning names the package: {env:#}" ); - assert!(fx.tgz_path().exists(), "artifact packed despite the mismatch"); + assert!( + fx.tgz_path().exists(), + "artifact packed despite the mismatch" + ); // The installed tree keeps its divergent bytes (only the stage changed). assert_eq!( std::fs::read(fx.installed_index()).unwrap(), @@ -1374,9 +1377,9 @@ fn vendor_resolves_percent_encoded_scope_purl() { assert_eq!(applied["purl"], "pkg:npm/%40scope/left-pad@1.3.0"); // Artifact lands under the DECODED scope dir. 
- let tgz = fx - .root() - .join(format!(".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz")); + let tgz = fx.root().join(format!( + ".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )); assert!(tgz.exists(), "tarball at the decoded scoped path"); // Lock rewired to the vendored artifact. @@ -1389,8 +1392,7 @@ fn vendor_resolves_percent_encoded_scope_purl() { ); // Ledger keyed by the VERBATIM encoded purl (manifest key parity). - let state: Value = - serde_json::from_slice(&std::fs::read(fx.state_path()).unwrap()).unwrap(); + let state: Value = serde_json::from_slice(&std::fs::read(fx.state_path()).unwrap()).unwrap(); assert!( state["entries"]["pkg:npm/%40scope/left-pad@1.3.0"].is_object(), "state keyed by the encoded manifest purl: {state:#}" diff --git a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs index d3e8cb5..a76bc22 100644 --- a/crates/socket-patch-cli/tests/scan_vendor_e2e.rs +++ b/crates/socket-patch-cli/tests/scan_vendor_e2e.rs @@ -171,6 +171,23 @@ fn run_scan_vendor(root: &Path, mock_uri: &str, extra: &[&str]) -> (i32, String, ) } +/// Vendor flows hold patch content in MEMORY: `.socket/` must end up with +/// nothing beyond the manifest and the committed vendor artifacts — no +/// `blobs/`, `diffs/`, `packages/`, or stray temp files. +fn assert_socket_dir_lean(root: &Path) { + let entries: Vec = std::fs::read_dir(root.join(".socket")) + .expect(".socket exists") + .map(|e| e.unwrap().file_name().to_string_lossy().into_owned()) + .filter(|n| n != "apply.lock") + .collect(); + assert!( + entries + .iter() + .all(|n| n == "manifest.json" || n == "vendor"), + "vendoring must not write blobs or temp files into .socket; found: {entries:?}" + ); +} + #[tokio::test] async fn scan_vendor_manifest_mode_end_to_end() { // scan --vendor: discover → download (manifest written) → vendor. 
@@ -231,6 +248,7 @@ async fn scan_vendor_manifest_mode_end_to_end() { BEFORE, "installed tree stays pristine" ); + assert_socket_dir_lean(tmp.path()); // Idempotent re-run: already_vendored skip, zero new applies. let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); @@ -329,6 +347,10 @@ async fn scan_vendor_detached_mode_writes_no_manifest() { .any(|r| r.url.path().contains("/patches/view/")), "idempotent detached re-run must not re-fetch the patch view" ); + assert!( + !tmp.path().join(".socket/blobs").exists(), + "detached vendoring must never persist blobs" + ); } #[tokio::test] @@ -568,7 +590,9 @@ async fn scan_vendor_resolves_percent_encoded_scoped_purl() { assert!(tgz.is_file(), "tarball at the decoded scoped path"); let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap(); assert!( - lock.contains(&format!(".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz")), + lock.contains(&format!( + ".socket/vendor/npm/{UUID}/@scope/left-pad-1.3.0.tgz" + )), "lock consumes the vendored tarball; lock={lock}" ); // Ledger keyed by the verbatim encoded purl. 
@@ -673,7 +697,9 @@ async fn scan_prune_reverts_unused_vendored_entry() { "manifest entry dropped: {manifest}" ); assert!( - !tmp.path().join(format!(".socket/vendor/npm/{UUID}")).exists(), + !tmp.path() + .join(format!(".socket/vendor/npm/{UUID}")) + .exists(), "artifact dir removed" ); // The (already left-pad-free) lock stays exactly as the user re-locked @@ -883,7 +909,9 @@ async fn vendor_auto_fetches_missing_package_from_lockfile() { assert_eq!(code, 0, "{v:#}"); let events = v["events"].as_array().unwrap(); assert!( - events.iter().any(|e| e["action"] == "applied" && e["purl"] == PURL), + events + .iter() + .any(|e| e["action"] == "applied" && e["purl"] == PURL), "{v:#}" ); assert!( @@ -1039,6 +1067,7 @@ async fn scan_vendor_works_on_a_completely_fresh_clone() { .join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz")) .is_file()); assert!(!tmp.path().join("node_modules").exists()); + assert_socket_dir_lean(tmp.path()); // Second run: in sync. let (code, stdout, stderr) = run_scan_vendor(tmp.path(), &mock.uri(), &[]); @@ -1046,11 +1075,10 @@ async fn scan_vendor_works_on_a_completely_fresh_clone() { let v: serde_json::Value = serde_json::from_str(stdout.trim()).expect("valid JSON"); let events = v["vendor"]["events"].as_array().unwrap(); assert!( - events - .iter() - .any(|e| e["errorCode"] == "already_vendored"), + events.iter().any(|e| e["errorCode"] == "already_vendored"), "{v}" ); + assert_socket_dir_lean(tmp.path()); } /// Read-only discovery flags lockfile-only packages in JSON and the human @@ -1069,8 +1097,14 @@ async fn scan_discovers_lockfile_only_packages_with_warning() { // JSON shape. 
let out = Command::new(binary()) .args([ - "scan", "--json", "--api-url", &mock.uri(), "--api-token", "fake-token", - "--org", ORG_SLUG, + "scan", + "--json", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, ]) .current_dir(tmp.path()) .env("SOCKET_TELEMETRY_DISABLED", "1") @@ -1085,8 +1119,15 @@ async fn scan_discovers_lockfile_only_packages_with_warning() { // Human output: the table marker + the note. let out = Command::new(binary()) .args([ - "scan", "--api-url", &mock.uri(), "--api-token", "fake-token", - "--org", ORG_SLUG, "--dry-run", "--yes", + "scan", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, + "--dry-run", + "--yes", ]) .current_dir(tmp.path()) .env("SOCKET_TELEMETRY_DISABLED", "1") @@ -1119,8 +1160,16 @@ async fn scan_apply_skips_lockfile_only_without_error() { let out = Command::new(binary()) .args([ - "scan", "--json", "--apply", "--yes", "--api-url", &mock.uri(), - "--api-token", "fake-token", "--org", ORG_SLUG, + "scan", + "--json", + "--apply", + "--yes", + "--api-url", + &mock.uri(), + "--api-token", + "fake-token", + "--org", + ORG_SLUG, ]) .current_dir(tmp.path()) .env("SOCKET_TELEMETRY_DISABLED", "1") @@ -1133,8 +1182,9 @@ async fn scan_apply_skips_lockfile_only_without_error() { assert_eq!(v["status"], "success", "{v}"); let patches = v["apply"]["patches"].as_array().unwrap(); assert!( - patches.iter().any(|p| p["action"] == "skipped" - && p["errorCode"] == "package_not_installed"), + patches + .iter() + .any(|p| p["action"] == "skipped" && p["errorCode"] == "package_not_installed"), "{v}" ); assert!( diff --git a/crates/socket-patch-core/src/crawlers/deno_crawler.rs b/crates/socket-patch-core/src/crawlers/deno_crawler.rs index 150cc27..9a7d784 100644 --- a/crates/socket-patch-core/src/crawlers/deno_crawler.rs +++ b/crates/socket-patch-core/src/crawlers/deno_crawler.rs @@ -131,10 +131,7 @@ impl DenoCrawler { continue; } // Cache layout: //// - let pkg_dir = 
jsr_cache_path - .join(&*scope) - .join(&*name) - .join(&*version); + let pkg_dir = jsr_cache_path.join(&*scope).join(&*name).join(&*version); if !is_dir(&pkg_dir).await { continue; } diff --git a/crates/socket-patch-core/src/patch/apply.rs b/crates/socket-patch-core/src/patch/apply.rs index fe5a9d5..da1ce6d 100644 --- a/crates/socket-patch-core/src/patch/apply.rs +++ b/crates/socket-patch-core/src/patch/apply.rs @@ -94,6 +94,11 @@ pub struct PatchSources<'a> { pub blobs_path: &'a Path, pub packages_path: Option<&'a Path>, pub diffs_path: Option<&'a Path>, + /// In-memory blob overlay (`afterHash` → patched bytes), consulted + /// BEFORE the on-disk blob dir. The vendor flows stage their patch + /// content here so vendoring writes no `.socket/blobs` entries and no + /// temporary files — the bytes live only for the run. + pub mem_blobs: Option<&'a HashMap>>, } impl<'a> PatchSources<'a> { @@ -105,6 +110,7 @@ impl<'a> PatchSources<'a> { blobs_path, packages_path: None, diffs_path: None, + mem_blobs: None, } } } @@ -877,16 +883,27 @@ pub async fn apply_package_patch( continue; } - // ── Strategy 3: per-file blob (legacy fallback) ────────────── - let blob_path = sources.blobs_path.join(&file_info.after_hash); - let patched_content = match tokio::fs::read(&blob_path).await { - Ok(content) => content, - Err(e) => { - result.error = Some(format!( - "Failed to read blob {}: {}", - file_info.after_hash, e - )); - return result; + // ── Strategy 3: per-file blob ──────────────────────────────── + // The in-memory overlay wins (vendor flows stage there — no + // `.socket/blobs` writes); the on-disk dir is the fallback. 
+ let mem_hit = sources + .mem_blobs + .and_then(|m| m.get(&file_info.after_hash)) + .cloned(); + let patched_content = match mem_hit { + Some(content) => content, + None => { + let blob_path = sources.blobs_path.join(&file_info.after_hash); + match tokio::fs::read(&blob_path).await { + Ok(content) => content, + Err(e) => { + result.error = Some(format!( + "Failed to read blob {}: {}", + file_info.after_hash, e + )); + return result; + } + } } }; @@ -2112,6 +2129,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2147,6 +2165,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2181,6 +2200,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2210,6 +2230,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2247,6 +2268,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2287,6 +2309,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", @@ -2317,6 +2340,7 @@ mod tests { blobs_path: &blobs_dir, packages_path: Some(&packages_dir), diffs_path: Some(&diffs_dir), + mem_blobs: None, }; let result = apply_package_patch( "pkg:npm/x@1.0.0", diff --git a/crates/socket-patch-core/src/patch/go_redirect.rs b/crates/socket-patch-core/src/patch/go_redirect.rs index f54cd60..63a43f2 100644 --- 
a/crates/socket-patch-core/src/patch/go_redirect.rs +++ b/crates/socket-patch-core/src/patch/go_redirect.rs @@ -236,7 +236,8 @@ pub async fn apply_go_redirect( } // Delegate to the hardened pipeline, pointed at the copy. - let mut result = apply_package_patch(purl, ©_dir, files, sources, uuid, false, policy).await; + let mut result = + apply_package_patch(purl, ©_dir, files, sources, uuid, false, policy).await; result.package_path = copy_dir.display().to_string(); // The golang sidecar advisory ("go mod verify will fail against go.sum") // is about in-cache patching; a `replace` copy bypasses go.sum entirely, so diff --git a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs index 5a533eb..67ab6f6 100644 --- a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs @@ -35,8 +35,8 @@ use serde_json::{json, Map, Value}; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, - VerifyResult, VerifyStatus, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, VerifyResult, + VerifyStatus, }; use crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::file_hash::compute_file_git_sha256; diff --git a/crates/socket-patch-core/src/patch/vendor/gem.rs b/crates/socket-patch-core/src/patch/vendor/gem.rs index 0eccace..c125bdf 100644 --- a/crates/socket-patch-core/src/patch/vendor/gem.rs +++ b/crates/socket-patch-core/src/patch/vendor/gem.rs @@ -53,8 +53,8 @@ use serde_json::Value; use crate::manifest::schema::{PatchFileInfo, PatchRecord}; use crate::patch::apply::{ - is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, - VerifyResult, VerifyStatus, + is_safe_relative_subpath, normalize_file_path, ApplyResult, PatchSources, VerifyResult, + VerifyStatus, }; use 
crate::patch::copy_tree::{fresh_copy, remove_tree}; use crate::patch::file_hash::compute_file_git_sha256; diff --git a/crates/socket-patch-core/src/patch/vendor/golang.rs b/crates/socket-patch-core/src/patch/vendor/golang.rs index d532485..f1ad5b5 100644 --- a/crates/socket-patch-core/src/patch/vendor/golang.rs +++ b/crates/socket-patch-core/src/patch/vendor/golang.rs @@ -335,10 +335,10 @@ pub async fn revert_go_vendor( #[cfg(test)] mod tests { use super::*; - use crate::patch::apply::MismatchPolicy; use crate::hash::git_sha256::compute_git_sha256_from_bytes; use crate::manifest::schema::{PatchFileInfo, VulnerabilityInfo}; use crate::patch::apply::ApplyResult; + use crate::patch::apply::MismatchPolicy; use crate::patch::vendor::state::VENDOR_MARKER_FILE; use std::collections::HashMap; use std::path::PathBuf; diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs index 476353c..afc7a61 100644 --- a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs +++ b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -191,8 +191,7 @@ fn dedup_prefer_integrity(raw: Vec) -> Vec { let key = (entry.name.clone(), entry.version.clone()); match seen.get(&key) { Some(&i) => { - if out[i].integrity == LockIntegrity::None - && entry.integrity != LockIntegrity::None + if out[i].integrity == LockIntegrity::None && entry.integrity != LockIntegrity::None { out[i] = entry; } @@ -218,7 +217,12 @@ pub async fn inventory_cargo_lock(project_root: &Path) -> Option, Option, Option, Option); + type CargoBlock = ( + Option, + Option, + Option, + Option, + ); let mut out = Vec::new(); let mut cur: Option = None; let flush = |cur: &mut Option, out: &mut Vec| { @@ -234,7 +238,9 @@ pub async fn inventory_cargo_lock(project_root: &Path) -> Option { + Some(c) + if crates_io && c.len() == 64 && c.bytes().all(|b| b.is_ascii_hexdigit()) => + { LockIntegrity::Sha256Hex(c) } _ => LockIntegrity::None, @@ -292,8 
+298,7 @@ pub async fn inventory_go_sum(project_root: &Path) -> Option> let mut out = Vec::new(); for line in text.lines() { let mut parts = line.split_whitespace(); - let (Some(module), Some(version), Some(hash)) = - (parts.next(), parts.next(), parts.next()) + let (Some(module), Some(version), Some(hash)) = (parts.next(), parts.next(), parts.next()) else { continue; }; @@ -349,7 +354,10 @@ async fn inventory_package_lock(root: &Path) -> Option> { continue; }; if node.get("link").and_then(Value::as_bool).unwrap_or(false) - || node.get("inBundle").and_then(Value::as_bool).unwrap_or(false) + || node + .get("inBundle") + .and_then(Value::as_bool) + .unwrap_or(false) { continue; } @@ -432,7 +440,10 @@ async fn inventory_pnpm_lock(root: &Path) -> Option> { } } // Our own vendored spec: not a registry dependency. - if tarball.as_deref().is_some_and(|t| parse_vendor_path(t).is_some()) { + if tarball + .as_deref() + .is_some_and(|t| parse_vendor_path(t).is_some()) + { continue; } out.push(LockfileEntry::npm( @@ -448,7 +459,9 @@ async fn inventory_pnpm_lock(root: &Path) -> Option> { // ───────────────────────────── yarn.lock (classic) ───────────────────────────── async fn inventory_yarn_classic(root: &Path) -> Option> { - let text = tokio::fs::read_to_string(root.join("yarn.lock")).await.ok()?; + let text = tokio::fs::read_to_string(root.join("yarn.lock")) + .await + .ok()?; let mut out = Vec::new(); for block in yarn_classic_lock::scan_blocks(&text) { // Our own vendored block: not a registry dependency. 
@@ -490,7 +503,9 @@ async fn inventory_yarn_classic(root: &Path) -> Option> { // ───────────────────────────── yarn.lock (berry) ───────────────────────────── async fn inventory_yarn_berry(root: &Path) -> Option> { - let text = tokio::fs::read_to_string(root.join("yarn.lock")).await.ok()?; + let text = tokio::fs::read_to_string(root.join("yarn.lock")) + .await + .ok()?; let mut out = Vec::new(); // Berry reuses classic's block grammar (same scanner the berry backend // imports); `__metadata` and workspace/patch/file resolutions are not @@ -512,8 +527,8 @@ async fn inventory_yarn_berry(root: &Path) -> Option> { continue; }; let version_from_res = reference.split("::").next().unwrap_or(reference); - let version = yarn_berry_lock::berry_field(&block.lines, "version") - .unwrap_or(version_from_res); + let version = + yarn_berry_lock::berry_field(&block.lines, "version").unwrap_or(version_from_res); let integrity = yarn_berry_lock::berry_field(&block.lines, "checksum") .map(|c| LockIntegrity::BerryChecksum(c.to_string())) .unwrap_or(LockIntegrity::None); @@ -525,7 +540,9 @@ async fn inventory_yarn_berry(root: &Path) -> Option> { // ──────────────────────────────── bun.lock ──────────────────────────────── async fn inventory_bun(root: &Path) -> Option> { - let text = tokio::fs::read_to_string(root.join("bun.lock")).await.ok()?; + let text = tokio::fs::read_to_string(root.join("bun.lock")) + .await + .ok()?; bun_lock::check_lock_version(&text).ok()?; let lines: Vec = text.split('\n').map(str::to_string).collect(); let entries = bun_lock::parse_packages_section(&lines).ok()?; @@ -537,7 +554,10 @@ async fn inventory_bun(root: &Path) -> Option> { if entry.elems.len() != 4 || !entry.elems[2].starts_with('{') { continue; } - let Some(spec) = entry.elems.first().and_then(|e| bun_lock::decode_json_string(e)) + let Some(spec) = entry + .elems + .first() + .and_then(|e| bun_lock::decode_json_string(e)) else { continue; }; @@ -577,7 +597,9 @@ async fn inventory_bun(root: &Path) -> 
Option> { /// versions drop the pretty leading `v`. #[cfg(feature = "composer")] pub async fn inventory_composer_lock(project_root: &Path) -> Option> { - let bytes = tokio::fs::read(project_root.join("composer.lock")).await.ok()?; + let bytes = tokio::fs::read(project_root.join("composer.lock")) + .await + .ok()?; let doc: Value = serde_json::from_slice(&bytes).ok()?; let mut out = Vec::new(); for section in ["packages", "packages-dev"] { @@ -625,14 +647,12 @@ pub async fn inventory_composer_lock(project_root: &Path) -> Option Option Option> { } } } - flush(&mut name, &mut version, &mut sourced_registry, &mut wheel, &mut out); + flush( + &mut name, + &mut version, + &mut sourced_registry, + &mut wheel, + &mut out, + ); Some(dedup_prefer_integrity(out)) } @@ -1187,7 +1217,8 @@ aliased@npm:real-name@^3.0.0: // ── yarn berry ──────────────────────────────────────────────────────── - const YARN_BERRY: &str = "# This file is generated by running \"yarn install\" inside your project. + const YARN_BERRY: &str = + "# This file is generated by running \"yarn install\" inside your project. # Manifest files (package.json) are also used. 
__metadata: @@ -1288,7 +1319,12 @@ __metadata: async fn dedup_prefers_integrity_bearing_instance() { let raw = vec![ LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), - LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::Sri("sha512-x==".into())), + LockfileEntry::npm( + "dup", + "1.0.0", + None, + LockIntegrity::Sri("sha512-x==".into()), + ), LockfileEntry::npm("dup", "1.0.0", None, LockIntegrity::None), ]; let out = finalize_npm(raw); @@ -1439,7 +1475,10 @@ checksum = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" let entries = inventory_composer_lock(tmp.path()).await.unwrap(); let monolog = entry(&entries, "monolog/monolog"); - assert_eq!(monolog.version, "3.5.0", "leading v dropped, name lowercased"); + assert_eq!( + monolog.version, "3.5.0", + "leading v dropped, name lowercased" + ); assert_eq!(monolog.purl, "pkg:composer/monolog/monolog@3.5.0"); assert!(matches!(monolog.integrity, LockIntegrity::Sha1Hex(_))); assert!(monolog.resolved.as_deref().unwrap().contains("zipball")); diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 8b13f22..4551991 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -53,10 +53,9 @@ pub mod cargo_lock; #[cfg(feature = "composer")] pub mod composer_lock; pub mod gem; -pub mod lock_inventory; -pub mod registry_fetch; #[cfg(feature = "golang")] pub mod golang; +pub mod lock_inventory; mod npm_common; pub mod npm_flavor; pub mod npm_lock; @@ -69,6 +68,7 @@ pub mod pypi_poetry; pub mod pypi_requirements; pub mod pypi_uv; pub mod pypi_wheel; +pub mod registry_fetch; mod toml_surgery; pub mod verify; pub mod yarn_berry_lock; @@ -162,7 +162,10 @@ pub(crate) async fn missing_existing_patch_files( if !is_safe_relative_subpath(normalized) { continue; } - if tokio::fs::metadata(staged_dir.join(normalized)).await.is_err() { + if 
tokio::fs::metadata(staged_dir.join(normalized)) + .await + .is_err() + { missing.push(file_name.clone()); } } @@ -185,6 +188,135 @@ pub(crate) fn failed_apply_result(purl: &str, error: String) -> ApplyResult { } } +/// Patched-content blobs harvested from the committed vendor artifacts: +/// for every manifest record whose patch uuid matches its ledger entry, +/// hash the artifact's files (git-sha256, the manifest hash) and keep the +/// ones matching the record's `afterHash`es. +/// +/// This is what lets vendor RE-RUNS (in-sync verification, re-vendor) run +/// with no network and no `.socket/blobs` — the committed artifact IS the +/// patched content. Artifact shapes: npm/pypi tarball-or-wheel files and +/// the dir-shaped ecosystems (cargo/golang/composer/gem copies). Fail-soft +/// per entry; tampered/oversized artifacts contribute nothing (the apply +/// pipeline's afterHash gate decides correctness either way). +pub async fn harvest_artifact_blobs( + project_root: &Path, + manifest_patches: &HashMap, +) -> HashMap> { + use crate::hash::git_sha256::compute_git_sha256_from_bytes; + + const MAX_ARTIFACT_BYTES: u64 = 256 * 1024 * 1024; + const MAX_FILE_BYTES: u64 = 64 * 1024 * 1024; + + let mut out: HashMap> = HashMap::new(); + let Ok(state) = load_state(project_root).await else { + return out; + }; + if state.entries.is_empty() { + return out; + } + + for (purl, record) in manifest_patches { + let needed: std::collections::HashSet<&str> = record + .files + .values() + .map(|f| f.after_hash.as_str()) + .filter(|h| !h.is_empty() && !out.contains_key(*h)) + .collect(); + if needed.is_empty() { + continue; + } + let Some(entry) = state.entries.get(purl).or_else(|| { + state + .entries + .values() + .find(|e| e.base_purl == crate::utils::purl::strip_purl_qualifiers(purl)) + }) else { + continue; + }; + if entry.uuid != record.uuid { + continue; // stale artifact: a re-vendor is pending, don't trust it + } + // SECURITY: the artifact path comes from the committed, 
tamperable + // ledger and is joined onto the project root for READING only — + // still, never follow an escaping path. + if !crate::patch::apply::is_safe_relative_subpath(&entry.artifact.path) { + continue; + } + let artifact = project_root.join(&entry.artifact.path); + + // Tarball/wheel artifacts: read entries in memory. + let lower = entry.artifact.path.to_ascii_lowercase(); + if lower.ends_with(".tgz") || lower.ends_with(".tar.gz") { + if let Ok(map) = crate::patch::package::read_archive_to_map(&artifact) { + for bytes in map.into_values() { + let h = compute_git_sha256_from_bytes(&bytes); + if needed.contains(h.as_str()) { + out.insert(h, bytes); + } + } + } + continue; + } + if lower.ends_with(".whl") || lower.ends_with(".zip") { + let Ok(bytes) = tokio::fs::read(&artifact).await else { + continue; + }; + if bytes.len() as u64 > MAX_ARTIFACT_BYTES { + continue; + } + let Ok(mut archive) = zip::ZipArchive::new(std::io::Cursor::new(bytes)) else { + continue; + }; + for i in 0..archive.len() { + use std::io::Read as _; + let Ok(mut file) = archive.by_index(i) else { + continue; + }; + if file.is_dir() || file.size() > MAX_FILE_BYTES { + continue; + } + let mut content = Vec::with_capacity(file.size() as usize); + if file.read_to_end(&mut content).is_err() { + continue; + } + let h = compute_git_sha256_from_bytes(&content); + if needed.contains(h.as_str()) { + out.insert(h, content); + } + } + continue; + } + // Dir-shaped artifacts (cargo/golang/composer/gem copies): the + // record keys are package-relative, so resolve each needed file + // directly instead of walking the whole tree. 
+ if tokio::fs::metadata(&artifact) + .await + .is_ok_and(|m| m.is_dir()) + { + for (file_name, info) in &record.files { + if !needed.contains(info.after_hash.as_str()) { + continue; + } + let rel = crate::patch::apply::normalize_file_path(file_name); + if !crate::patch::apply::is_safe_relative_subpath(rel) { + continue; + } + if let Ok(content) = tokio::fs::read(artifact.join(rel)).await { + if content.len() as u64 > MAX_FILE_BYTES { + continue; + } + let h = compute_git_sha256_from_bytes(&content); + if h == info.after_hash { + out.insert(h, content); + } + } + } + } + } + out +} + /// Run the hardened apply pipeline against a vendor stage/copy with the /// vendor auto-force policy: /// @@ -394,3 +526,152 @@ mod policy_tests { assert!(mismatch_overwrite_warnings(&r, "x", "1").is_empty()); } } + +#[cfg(test)] +mod harvest_tests { + use super::*; + use crate::hash::git_sha256::compute_git_sha256_from_bytes; + use crate::manifest::schema::{PatchFileInfo, PatchRecord}; + use std::collections::HashMap; + use std::io::Write as _; + + const UUID: &str = "11111111-2222-4333-8444-555555555555"; + const PATCHED: &[u8] = b"module.exports = patched;\n"; + + fn record(purl: &str, uuid: &str, file: &str, after: &[u8]) -> (String, PatchRecord) { + let mut files = HashMap::new(); + files.insert( + file.to_string(), + PatchFileInfo { + before_hash: compute_git_sha256_from_bytes(b"original"), + after_hash: compute_git_sha256_from_bytes(after), + }, + ); + ( + purl.to_string(), + PatchRecord { + uuid: uuid.to_string(), + exported_at: "2024-01-01T00:00:00Z".to_string(), + files, + vulnerabilities: HashMap::new(), + description: String::new(), + license: "MIT".to_string(), + tier: "free".to_string(), + }, + ) + } + + fn write_ledger(root: &Path, purl: &str, uuid: &str, artifact_path: &str) { + let vendor_dir = root.join(".socket/vendor"); + std::fs::create_dir_all(&vendor_dir).unwrap(); + let state = serde_json::json!({ + "version": 1, + "entries": { + purl: { + "ecosystem": "npm", + 
"basePurl": purl, + "uuid": uuid, + "artifact": { "path": artifact_path }, + "wiring": [], + } + } + }); + std::fs::write( + vendor_dir.join("state.json"), + serde_json::to_vec(&state).unwrap(), + ) + .unwrap(); + } + + fn write_tgz(path: &Path, entry_name: &str, content: &[u8]) { + std::fs::create_dir_all(path.parent().unwrap()).unwrap(); + let gz = flate2::write::GzEncoder::new( + std::fs::File::create(path).unwrap(), + flate2::Compression::default(), + ); + let mut tar = tar::Builder::new(gz); + let mut header = tar::Header::new_gnu(); + header.set_size(content.len() as u64); + header.set_mode(0o644); + header.set_cksum(); + tar.append_data(&mut header, entry_name, content).unwrap(); + tar.into_inner().unwrap().finish().unwrap().flush().unwrap(); + } + + #[tokio::test] + async fn harvests_after_blobs_from_committed_tgz() { + let tmp = tempfile::tempdir().unwrap(); + let purl = "pkg:npm/left-pad@1.3.0"; + let rel = format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"); + write_tgz(&tmp.path().join(&rel), "package/index.js", PATCHED); + write_ledger(tmp.path(), purl, UUID, &rel); + + let (k, r) = record(purl, UUID, "package/index.js", PATCHED); + let patches = HashMap::from([(k, r)]); + let mem = harvest_artifact_blobs(tmp.path(), &patches).await; + let hash = compute_git_sha256_from_bytes(PATCHED); + assert_eq!( + mem.get(&hash).map(|b| b.as_slice()), + Some(PATCHED), + "tgz artifact must yield its afterHash blob" + ); + } + + #[tokio::test] + async fn stale_uuid_artifact_contributes_nothing() { + let tmp = tempfile::tempdir().unwrap(); + let purl = "pkg:npm/left-pad@1.3.0"; + let rel = format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"); + write_tgz(&tmp.path().join(&rel), "package/index.js", PATCHED); + // Ledger still points at an OLD patch uuid: a re-vendor is pending + // and the artifact's content must not be trusted for the new record. 
+ write_ledger( + tmp.path(), + purl, + "99999999-aaaa-4bbb-8ccc-dddddddddddd", + &rel, + ); + + let (k, r) = record(purl, UUID, "package/index.js", PATCHED); + let patches = HashMap::from([(k, r)]); + assert!(harvest_artifact_blobs(tmp.path(), &patches) + .await + .is_empty()); + } + + #[tokio::test] + async fn escaping_artifact_path_is_rejected() { + let tmp = tempfile::tempdir().unwrap(); + let purl = "pkg:npm/left-pad@1.3.0"; + // The artifact CONTENT would match — only the committed, tamperable + // ledger path escapes the project. Must contribute nothing. + let project = tmp.path().join("project"); + write_tgz(&tmp.path().join("outside.tgz"), "package/index.js", PATCHED); + write_ledger(&project, purl, UUID, "../outside.tgz"); + + let (k, r) = record(purl, UUID, "package/index.js", PATCHED); + let patches = HashMap::from([(k, r)]); + assert!(harvest_artifact_blobs(&project, &patches).await.is_empty()); + } + + #[tokio::test] + async fn dir_shaped_artifact_resolves_record_relative_files() { + let tmp = tempfile::tempdir().unwrap(); + let purl = "pkg:cargo/serde@1.0.0"; + let rel = format!(".socket/vendor/cargo/{UUID}/serde-1.0.0"); + let file_dir = tmp.path().join(&rel).join("src"); + std::fs::create_dir_all(&file_dir).unwrap(); + std::fs::write(file_dir.join("lib.rs"), PATCHED).unwrap(); + write_ledger(tmp.path(), purl, UUID, &rel); + + let (k, r) = record(purl, UUID, "src/lib.rs", PATCHED); + let patches = HashMap::from([(k, r)]); + let mem = harvest_artifact_blobs(tmp.path(), &patches).await; + let hash = compute_git_sha256_from_bytes(PATCHED); + assert_eq!( + mem.get(&hash).map(|b| b.as_slice()), + Some(PATCHED), + "dir-shaped artifact must yield its afterHash blob" + ); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/npm_common.rs b/crates/socket-patch-core/src/patch/vendor/npm_common.rs index e2b4cdb..c9f5a8e 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_common.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_common.rs @@ 
-382,12 +382,18 @@ mod tests { fn guard_coordinates_accepts_plain_and_scoped_names() { let record = record_with_uuid(UUID); let coords = guard_coordinates("pkg:npm/left-pad@1.3.0", &record).unwrap(); - assert_eq!((coords.name.as_str(), coords.version.as_str()), ("left-pad", "1.3.0")); + assert_eq!( + (coords.name.as_str(), coords.version.as_str()), + ("left-pad", "1.3.0") + ); assert_eq!(coords.uuid_dir_rel, format!(".socket/vendor/npm/{UUID}")); assert_eq!(coords.base_purl, "pkg:npm/left-pad@1.3.0"); let coords = guard_coordinates("pkg:npm/@scope/pkg@1.0.0?artifact_id=x", &record).unwrap(); - assert_eq!((coords.name.as_str(), coords.version.as_str()), ("@scope/pkg", "1.0.0")); + assert_eq!( + (coords.name.as_str(), coords.version.as_str()), + ("@scope/pkg", "1.0.0") + ); assert_eq!( coords.base_purl, "pkg:npm/@scope/pkg@1.0.0", "qualifiers stripped" diff --git a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs index 7e23591..c10ea29 100644 --- a/crates/socket-patch-core/src/patch/vendor/npm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/npm_lock.rs @@ -1141,10 +1141,7 @@ mod tests { let tgz = tokio::fs::read(fx.root().join(fx.expected_rel_tgz())) .await .unwrap(); - assert_eq!( - tgz_member(&tgz, "package/index.js").unwrap(), - PATCHED_INDEX - ); + assert_eq!(tgz_member(&tgz, "package/index.js").unwrap(), PATCHED_INDEX); // The installed tree keeps its (divergent) bytes — only the stage // was overwritten. 
@@ -1250,10 +1247,7 @@ mod tests { let tgz = tokio::fs::read(fx.root().join(fx.expected_rel_tgz())) .await .unwrap(); - assert_eq!( - tgz_member(&tgz, "package/index.js").unwrap(), - PATCHED_INDEX - ); + assert_eq!(tgz_member(&tgz, "package/index.js").unwrap(), PATCHED_INDEX); let lock = fx.read_lock().await; assert_eq!( lock["packages"]["node_modules/left-pad"]["resolved"], diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index 0174f6c..2f74548 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -2330,7 +2330,9 @@ snapshots: assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, None); // Missing lock: undeterminable. - tokio::fs::remove_file(fx.root().join(PNPM_LOCK)).await.unwrap(); + tokio::fs::remove_file(fx.root().join(PNPM_LOCK)) + .await + .unwrap(); assert_eq!(pnpm_entry_in_use(&entry, fx.root()).await, None); } diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs index 3cef46b..22f6626 100644 --- a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -210,7 +210,10 @@ async fn fetch_composer( .map_err(|e| FetchError::Failed(format!("cannot create fetch tempdir: {e}")))?; let dir = tmp.path().join("package"); extract_zip(&bytes, &dir, /*strip_first=*/ true).map_err(FetchError::Failed)?; - if tokio::fs::metadata(dir.join("composer.json")).await.is_err() { + if tokio::fs::metadata(dir.join("composer.json")) + .await + .is_err() + { return Err(FetchError::Failed(format!( "fetched dist for {}@{} carries no composer.json", entry.name, entry.version @@ -256,7 +259,9 @@ async fn fetch_gem( continue; } if e.header().size().unwrap_or(u64::MAX) > MAX_DOWNLOAD_BYTES { - return Err(FetchError::Failed("data.tar.gz exceeds the size cap".into())); + return 
Err(FetchError::Failed( + "data.tar.gz exceeds the size cap".into(), + )); } let mut buf = Vec::new(); e.read_to_end(&mut buf) @@ -546,9 +551,10 @@ async fn fetch_npm( entry: &LockfileEntry, client: &reqwest::Client, ) -> Result { - let url = entry.resolved.clone().unwrap_or_else(|| { - npm_tarball_url(&npm_registry_base(), &entry.name, &entry.version) - }); + let url = entry + .resolved + .clone() + .unwrap_or_else(|| npm_tarball_url(&npm_registry_base(), &entry.name, &entry.version)); let bytes = download(client, &url).await.map_err(FetchError::Failed)?; match &entry.integrity { // yarn berry locks never hash the tarball itself — the checksum is @@ -698,11 +704,9 @@ fn verify_integrity(bytes: &[u8], integrity: &LockIntegrity) -> Result<(), Fetch ))) } } - LockIntegrity::BerryChecksum(_) | LockIntegrity::GoH1(_) => { - Err(FetchError::Unverifiable( - "verifier handled by a dedicated ecosystem fetcher".to_string(), - )) - } + LockIntegrity::BerryChecksum(_) | LockIntegrity::GoH1(_) => Err(FetchError::Unverifiable( + "verifier handled by a dedicated ecosystem fetcher".to_string(), + )), LockIntegrity::None => Err(FetchError::Unverifiable( "no integrity recorded".to_string(), )), @@ -900,7 +904,10 @@ mod tests { assert!(verify_sri(bytes, &multi).is_ok()); let bad = sri_of(b"other"); assert!(verify_sri(bytes, &bad).is_err()); - assert!(verify_sri(bytes, "md5-abc=").is_err(), "unknown algos refuse"); + assert!( + verify_sri(bytes, "md5-abc=").is_err(), + "unknown algos refuse" + ); } #[tokio::test] @@ -1112,7 +1119,11 @@ mod tests { async fn cargo_crate_fetch_verifies_sha256_and_extracts() { // .crate = tar.gz with a {name}-{version}/ top dir. 
let crate_bytes = make_tgz(&[ - ("left-pad-1.3.0/Cargo.toml", b"[package]\nname = \"left-pad\"\n", false), + ( + "left-pad-1.3.0/Cargo.toml", + b"[package]\nname = \"left-pad\"\n", + false, + ), ("left-pad-1.3.0/src/lib.rs", b"pub fn pad() {}\n", false), ]); let sha = hex::encode(Sha256::digest(&crate_bytes)); @@ -1230,7 +1241,9 @@ mod tests { // Tampered h1 fails closed. let entry = LockfileEntry { - integrity: LockIntegrity::GoH1("h1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=".into()), + integrity: LockIntegrity::GoH1( + "h1:AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=".into(), + ), ..entry }; match fetch_and_stage(&entry, &build_registry_client()).await { @@ -1242,14 +1255,17 @@ mod tests { #[cfg(feature = "golang")] #[test] fn go_escape_uppercase_and_zip_prefix_guards() { - assert_eq!(go_escape("github.com/Azure/azure-sdk"), "github.com/!azure/azure-sdk"); + assert_eq!( + go_escape("github.com/Azure/azure-sdk"), + "github.com/!azure/azure-sdk" + ); assert_eq!(go_escape("v1.0.0-RC1"), "v1.0.0-!r!c1"); // An entry outside the module prefix fails the whole artifact. let zip_bytes = make_module_zip("github.com/x/y@v1.0.0/", &[("go.mod", b"m\n")]); let tmp = tempfile::tempdir().unwrap(); - let err = extract_zip_with_prefix(&zip_bytes, tmp.path(), "github.com/OTHER@v1/") - .unwrap_err(); + let err = + extract_zip_with_prefix(&zip_bytes, tmp.path(), "github.com/OTHER@v1/").unwrap_err(); assert!(err.contains("outside"), "{err}"); } @@ -1275,7 +1291,10 @@ mod tests { async fn composer_dist_fetch_verifies_sha1_and_strips_top_dir() { // GitHub zipballs carry an `owner-repo-sha/` top dir. 
let zip_bytes = make_zip(&[ - ("Seldaek-monolog-abc123/composer.json", br#"{"name":"monolog/monolog"}"#), + ( + "Seldaek-monolog-abc123/composer.json", + br#"{"name":"monolog/monolog"}"#, + ), ("Seldaek-monolog-abc123/src/Logger.php", b" Option<(String, String, String)> { - parse_jsr_purl(purl) - .map(|((s, n), v)| (s.into_owned(), n.into_owned(), v.into_owned())) + parse_jsr_purl(purl).map(|((s, n), v)| (s.into_owned(), n.into_owned(), v.into_owned())) } #[cfg(feature = "deno")] @@ -900,7 +899,10 @@ mod tests { "pkg:npm/@scope/x@1.0.0", "pkg:npm/%40scope/x@1.0.0" )); - assert!(!purl_eq("pkg:npm/%40scope/x@1.0.0", "pkg:npm/@scope/x@2.0.0")); + assert!(!purl_eq( + "pkg:npm/%40scope/x@1.0.0", + "pkg:npm/@scope/x@2.0.0" + )); // Qualifiers/subpath are preserved verbatim (not decoded). assert_eq!( normalize_purl("pkg:npm/%40s/x@1?artifact_id=a%2Fb"), diff --git a/crates/socket-patch-core/tests/blob_fetcher_edges_e2e.rs b/crates/socket-patch-core/tests/blob_fetcher_edges_e2e.rs index 5dcdf8f..761e56e 100644 --- a/crates/socket-patch-core/tests/blob_fetcher_edges_e2e.rs +++ b/crates/socket-patch-core/tests/blob_fetcher_edges_e2e.rs @@ -184,6 +184,7 @@ async fn fetch_missing_sources_package_mode_with_no_packages_path() { blobs_path: &blobs, packages_path: None, diffs_path: None, + mem_blobs: None, }; // Non-empty manifest: there IS work to do. 
So `total == 0` below can // only mean the None-packages_path branch short-circuited — not that @@ -226,6 +227,7 @@ async fn fetch_missing_sources_diff_mode_with_no_diffs_path() { blobs_path: &blobs, packages_path: None, diffs_path: None, + mem_blobs: None, }; let manifest = manifest_with_after_hashes(&[&"a".repeat(64)]); let client = dummy_client(); @@ -632,6 +634,7 @@ async fn fetch_missing_sources_diff_downloads_and_writes_archive() { blobs_path: &blobs, packages_path: None, diffs_path: Some(&diffs), + mem_blobs: None, }; let manifest = manifest_with_uuids(&[uuid]); let client = proxy_client(&server.uri()); @@ -688,6 +691,7 @@ async fn fetch_missing_sources_package_downloads_via_package_endpoint() { blobs_path: &blobs, packages_path: Some(&packages), diffs_path: None, + mem_blobs: None, }; let manifest = manifest_with_uuids(&[uuid]); let client = proxy_client(&server.uri()); @@ -723,6 +727,7 @@ async fn fetch_missing_sources_diff_404_is_failure_with_kind_message() { blobs_path: &blobs, packages_path: None, diffs_path: Some(&diffs), + mem_blobs: None, }; let manifest = manifest_with_uuids(&[uuid]); let client = proxy_client(&server.uri()); @@ -765,6 +770,7 @@ async fn fetch_missing_sources_diff_invokes_progress_callback() { blobs_path: &blobs, packages_path: None, diffs_path: Some(&diffs), + mem_blobs: None, }; let manifest = manifest_with_uuids(&[uuid]); let client = proxy_client(&server.uri()); From c92f6b2ec3cd536f0642a69c80022db3b06bab21 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Fri, 12 Jun 2026 02:21:22 -0400 Subject: [PATCH 16/19] feat(repair): rebuild missing/corrupt vendored artifacts + no-ledger reconstruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit repair now owns the vendored-artifact lifecycle: artifacts referenced by the ledger and/or rewired lockfiles but missing or corrupt on disk are rebuilt fail-closed, and a wholesale-deleted .socket/vendor (state.json included) is reconstructed from 
the lockfile references alone. Core: - ArtifactHealth + check_vendored_artifact (vendor/verify.rs): per-file afterHashes plus a whole-file sha256 cross-check against the ledger for file-shaped artifacts. - recover_lock_entry (lock_inventory): the pre-vendor registry fragment recovered from the wiring originals (npm/pnpm/yarn/berry/bun fragments, composer dist, gem checksum line, uv wheel, cargo entry.lock); golang rides the unrewired go.sum. - wired_vendor_integrity + fetch_npm_unverified + artifact_matches_integrity: the REWIRED lock's recorded integrity of our packed tarball is the trust anchor — reconstruction can fetch pristine unverified and still land only bytes that reproduce the wired integrity (tamper => removed, fail). - Artifact-only rebuild branches in the composer/cargo/gem/golang/pypi backends: wired-but-broken artifacts rebuild in place with NO lock write and NO ledger re-record (fixes the latent original-clobbering full-path re-run); golang in-sync re-runs now record nothing; uv same-uuid re-runs are an InSync hot path instead of a refusal. - pnpm: fail-closed duplicate-mapping-key guard for half-edited locks in edit_packages/edit_snapshot_rekey. - Memory stager: a diff archive alone is no longer a sufficient vendor source (auto-force can need full after-blobs a diff cannot produce). CLI: - repair_vendor.rs: ledger health pass, lockfile-reference reconstruction (uuid recovered from the contract's path rule; manifest record else the patch view API => detached entry with the record embedded), rebuilds via the normal vendor dispatch + the pristine-source ladder, post-verified against the recorded fingerprint. Offline rebuilds run when fully local. - repair: manifest_not_found softened when vendor traces exist; step 1 skips vendored/lockfile-referenced entries (a vendored project's repair never re-litters .socket/blobs|diffs). 
- vendor auto-fetch: a MISSING committed artifact falls through to the ledger-recovered registry fetch instead of failing; corrupt stays loud. - Envelope: PatchAction::Rebuilt + summary.rebuilt (omitted while zero). Tests: repair_vendor_e2e (12 scenarios incl. tampered-pristine rejection, offline both ways, detached, no-ledger and no-manifest reconstruction), per-backend wired-missing-copy rebuilds, health matrix, fragment recovery per wiring kind, pnpm colon-key scanner unit, half-drifted lock guard. Live-verified on Flowise: 19/19 fresh vendor with a lean .socket, deleted artifact rebuilt byte-identically, and a 14/14 full reconstruction from nothing but the rewired pnpm lockfile. Co-Authored-By: Claude Fable 5 --- crates/socket-patch-cli/CLI_CONTRACT.md | 46 +- .../src/commands/fetch_stage.rs | 21 +- crates/socket-patch-cli/src/commands/get.rs | 19 + crates/socket-patch-cli/src/commands/mod.rs | 1 + .../socket-patch-cli/src/commands/repair.rs | 131 ++- .../src/commands/repair_vendor.rs | 853 ++++++++++++++++++ .../socket-patch-cli/src/commands/vendor.rs | 325 ++++--- crates/socket-patch-cli/src/json_envelope.rs | 13 + .../tests/repair_vendor_e2e.rs | 755 ++++++++++++++++ .../src/patch/vendor/cargo.rs | 147 ++- .../src/patch/vendor/composer_lock.rs | 128 ++- .../socket-patch-core/src/patch/vendor/gem.rs | 174 +++- .../src/patch/vendor/golang.rs | 117 ++- .../src/patch/vendor/lock_inventory.rs | 693 ++++++++++++++ .../socket-patch-core/src/patch/vendor/mod.rs | 1 + .../src/patch/vendor/pnpm_lock.rs | 84 ++ .../src/patch/vendor/pypi.rs | 232 ++++- .../src/patch/vendor/pypi_uv.rs | 44 +- .../src/patch/vendor/registry_fetch.rs | 114 ++- .../src/patch/vendor/verify.rs | 180 ++++ 20 files changed, 3817 insertions(+), 261 deletions(-) create mode 100644 crates/socket-patch-cli/src/commands/repair_vendor.rs create mode 100644 crates/socket-patch-cli/tests/repair_vendor_e2e.rs diff --git a/crates/socket-patch-cli/CLI_CONTRACT.md b/crates/socket-patch-cli/CLI_CONTRACT.md 
index 88229b3..9b02e11 100644 --- a/crates/socket-patch-cli/CLI_CONTRACT.md +++ b/crates/socket-patch-cli/CLI_CONTRACT.md @@ -15,7 +15,7 @@ This document defines the **public surface** of the `socket-patch` binary. Anyth | `list` | — | Print patches in the local manifest | | `remove` | — | Remove patch from manifest (rolls back first); requires positional `identifier` | | `setup` | — | Wire automatic-patching install hooks (npm/pypi/gem) | -| `repair` | `gc` | Download missing blobs + clean up unused ones | +| `repair` | `gc` | Download missing blobs, rebuild missing/corrupt vendored artifacts, clean up unused ones | | `vendor` | — | Eject patched dependencies into committable `.socket/vendor/` and rewire lockfiles | | `vex` | — | Emit an OpenVEX 0.2.0 attestation derived from the local manifest | @@ -334,6 +334,37 @@ re-runs and fresh clones of vendored projects need no network); anything still m into memory via the patch-view endpoint. A vendored project's `.socket/` holds only `manifest.json` (omitted in detached mode) and `vendor/`. +**Vendored artifact repair (v3.5)**: `repair` health-checks every ledger entry — per-file +afterHashes inside the artifact plus, for file-shaped artifacts (`.tgz`/`.whl`), the whole file +against the ledger's recorded sha256 (the rewired lock integrity references those exact bytes) — +and REBUILDS missing/corrupt artifacts through the normal vendor backends. The wired hot paths +rebuild the artifact only: lockfiles stay byte-identical and the ledger entry is not re-recorded +(the first run's entry holds the only pre-vendor originals). 
Pristine sources follow the same +ladder as vendor: the installed copy first (works under `--offline`), then a lockfile-verified +registry fetch, then the pre-vendor registry fragment recovered from the ledger's wiring +`original`s (`recover_lock_entry`) — always integrity-verified fail-closed, and the rebuilt +artifact is re-verified against the recorded fingerprint before the run counts it (`rebuilt` +event; a mismatch removes the artifact and fails with `vendor_artifact_rebuild_failed`). +Lockfile references to `.socket/vendor/<ecosystem>/<uuid>/...` with NO ledger coverage (the ledger was +deleted wholesale) are RECONSTRUCTED: the uuid comes from the path (the recovery rule above), the +record from the manifest — or the patch API, yielding a *detached* entry with the record embedded +— and a fresh ledger entry is persisted with the rebuilt artifact's fingerprint. When nothing is +installed and the ledger is gone, npm-family reconstruction has one more rung: the REWIRED +lockfile still records the integrity of the packed vendored tarball, so the pristine copy is +fetched (unverified, conventional registry URL, `SOCKET_NPM_REGISTRY` honored) and the +deterministically REBUILT artifact must reproduce that wired integrity — a tampered pristine +source changes the rebuilt bytes and fails closed (`vendor_artifact_rebuild_failed`, nothing +kept). Reconstructed entries carry no pre-vendor wiring originals, so a later `--revert` degrades +to the documented `vendor_lock_entry_drifted` guidance (re-resolve with the package manager). Because of this +phase, `repair` no longer errors with `manifest_not_found` when the project has a vendor ledger +or vendor-path lockfile references — it runs the vendored phase alone. Step 1's source download +likewise skips vendored-in-sync manifest entries (their content lives in the committed artifact), +so repairing a vendored project never re-litters
`--dry-run` previews +(`details.wouldRebuild`); `--offline` rebuilds only from fully local sources and fails per-entry +otherwise; `vendor`/`scan --vendor` re-runs get the same rebuild for wired-but-broken artifacts +(`vendor_artifact_rebuilt` warning) and recover registry resolutions for missing committed +artifacts instead of failing. + ### Path convention + patch-UUID recovery (stable) ```text @@ -590,6 +621,7 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `failed` | every command | A specific patch attempt failed. `errorCode` + `error` set. | | `removed` | `gc`/`repair`, `remove`, `rollback` | Data was removed from `.socket/` (or files rolled back). `bytes` optional. | | `verified` | `apply --dry-run`, `scan --dry-run` | The patch *would* apply cleanly. `files` lists previewed changes. | +| `rebuilt` | `repair` | A missing/corrupt vendored artifact was rebuilt in place (or its lost ledger entry restored — `details.ledgerRestored`). `summary.rebuilt` counts these (the field is omitted while zero). | ### Stable `errorCode` tags @@ -618,8 +650,14 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `vendor_integrity_unverified` | `skipped` (warning) | vendor (pipenv): the lockfile format does not hash-check file entries; the committed wheel bytes are the protection. | | `vendor_content_mismatch_overwritten` | `skipped` (warning) | vendor: a staged file matched NEITHER beforeHash nor afterHash (patch built against different bytes, or local edits); the stage was overwritten with the verified patched content and the vendor succeeded. | | `vendor_fetched_missing` | `skipped` (warning) | vendor: the package was not installed; its pristine artifact was fetched per the lockfile resolution (or staged from the committed vendor artifact), integrity-verified, and vendored — the project tree was not touched. 
| -| `vendor_fetch_failed` | `failed` | vendor: the lockfile-resolved fetch was attempted and failed (HTTP error, size cap, integrity mismatch, or a corrupt committed artifact). Suppresses the duplicate `package_not_installed` skip. | +| `vendor_fetch_failed` | `failed` | vendor: the lockfile-resolved fetch was attempted and failed (HTTP error, size cap, integrity mismatch, or a PRESENT-but-corrupt committed artifact — pointed at `socket-patch repair`). A MISSING committed artifact no longer lands here: it falls through to the ledger-recovered registry fetch. Suppresses the duplicate `package_not_installed` skip. | | `vendor_fetch_unverifiable` | `skipped` (warning) | vendor: the lockfile records no usable integrity for the missing package; nothing was fetched (fail-closed) and the `package_not_installed` skip follows. | +| `vendor_artifact_missing` | `skipped` (warning) / `failed` | vendor: the committed artifact is gone — the registry resolution is recovered from the ledger and the artifact rebuilt (warning); repair `--offline` with no local source surfaces it as the per-entry failure instead. | +| `vendor_artifact_corrupt` | `failed` | repair `--offline`: the committed artifact fails verification (member afterHashes or the ledger's whole-file sha256) and no local source can rebuild it. Online repairs rebuild instead. | +| `vendor_artifact_rebuilt` | `skipped` (warning) | vendor / scan `--vendor`: a wired-but-missing/stale artifact was rebuilt in place; lockfiles and the ledger entry untouched. (Under `repair` the `rebuilt` event carries this signal.) | +| `vendor_artifact_rebuild_failed` | `failed` | repair: the rebuild ran but the result failed verification against the recorded fingerprint (e.g. an edited state.json sha); the unverifiable artifact was removed. 
| +| `vendor_artifact_unrepairable` | `failed` | repair: no verifiable pristine source exists (not installed + lockfile rewired + no recoverable ledger fragment), the wheel is platform-locked with no installed copy, or the ledger entry itself cannot be trusted. | +| `vendor_uuid_mismatch` | `skipped` | repair: the manifest's patch uuid moved past the vendored artifact — a re-vendor (`vendor` / `scan --vendor`) is pending; repair does not cross patch generations. | | `content_mismatch_overwritten` | `skipped` (warning) | apply (default policy): a file matched NEITHER beforeHash nor afterHash and was overwritten with the full verified patched content. `--strict` turns this case into a `failed` event instead. | | `vendor_lock_checksums_unsupported` / `vendor_stale_lock_checksum` | `failed` | vendor (gem): an ambiguous/platform CHECKSUMS entry, or a v1-wired lock whose stale token blocks the hot path (run `vendor --revert` + re-vendor). | | `pypi_{poetry,pdm,pipenv}_no_lockfile` | `failed` | vendor (pypi): a lock-less tool marker with no `requirements.txt` fallback — run ` lock`. | @@ -629,7 +667,7 @@ Every `--json` invocation emits a single JSON object that follows the **unified | Code | Subcommands | Meaning | |-----------------------|----------------------------------|---------| -| `manifest_not_found` | list, remove, repair, rollback | `.socket/manifest.json` doesn't exist. | +| `manifest_not_found` | list, remove, repair, rollback | `.socket/manifest.json` doesn't exist. v3.5: `repair` proceeds anyway (vendored phase only) when a vendor ledger or vendor-path lockfile references exist. | | `manifest_invalid` | list, remove | Manifest exists but is unparseable. | | `manifest_unreadable` | list, remove | I/O error reading manifest. | | `apply_failed` | apply | apply pipeline error before any patch ran. 
| @@ -643,7 +681,7 @@ Every `--json` invocation emits a single JSON object that follows the **unified | `apply` | `Applied` · `Updated` · `Skipped` (already_patched / package_not_installed / vendored) · `Failed` · `Verified` (dry-run) | | `vendor` | `Applied` (= vendored; `command` routes) · `Skipped` (refusals, warnings, unsupported ecosystems) · `Failed` · `Removed` (reconcile + `--revert`) · `Verified` (dry-run) | | `list` | `Discovered` (with `details.vulnerabilities`, `details.tier`, `details.license`, `details.description`, `details.exportedAt`) | -| `repair`/`gc`| `Downloaded` (or `Verified` on dry-run) · `Removed` (or `Verified`) · `Failed` artifact events | +| `repair`/`gc`| `Downloaded` (or `Verified` on dry-run) · `Rebuilt` (vendored artifacts; `Verified` previews on dry-run) · `Skipped` (vendor_uuid_mismatch) · `Removed` (or `Verified`) · `Failed` events | | `remove` | `Removed` (per purl) · artifact-level `Removed` event (with `details.blobsRemoved`, `details.rolledBack`) | ### Migration status (v3.0) diff --git a/crates/socket-patch-cli/src/commands/fetch_stage.rs b/crates/socket-patch-cli/src/commands/fetch_stage.rs index 8976676..17db89b 100644 --- a/crates/socket-patch-cli/src/commands/fetch_stage.rs +++ b/crates/socket-patch-cli/src/commands/fetch_stage.rs @@ -283,11 +283,12 @@ pub enum MemStageOutcome { } /// Stage patch sources for a VENDOR run without writing anything: -/// per-record availability follows the same rule as -/// [`stage_patch_sources`] (all after-blobs on disk, or a diff/package -/// archive on disk), and records with no usable local source have their -/// full per-file content fetched into memory from the patch view -/// endpoint (`blobContent`). 
Offline runs with missing sources are +/// a record is locally satisfied when all its after-blobs are on disk or +/// a package archive is (a diff archive is NOT sufficient — vendor's +/// auto-force policy can need the full after-blob for files a diff cannot +/// reproduce); anything else has its full per-file content fetched into +/// memory from the patch view endpoint (`blobContent`), preceded by the +/// committed-artifact harvest. Offline runs with missing sources are /// `Unavailable` with the same diagnostics as the disk stager. pub async fn stage_vendor_sources_in_memory( common: &GlobalArgs, @@ -300,9 +301,14 @@ pub async fn stage_vendor_sources_in_memory( let packages = socket_dir.join("packages"); let missing_blobs = get_missing_blobs(manifest, &blobs).await; - let missing_diff_archives = get_missing_archives(manifest, &diffs).await; let missing_package_archives = get_missing_archives(manifest, &packages).await; + // A diff archive alone is NOT a sufficient source here, unlike the disk + // stager: vendoring runs the auto-force policy, where a beforeHash + // mismatch (already-applied tree, patch built against different bytes) + // is overwritten with the FULL after-blob — which a diff cannot + // produce. On-disk diffs still serve Strategy 2 for clean files; the + // after-blob content must additionally exist (disk, harvest, or fetch). 
let mut to_fetch: Vec<(&str, &str)> = manifest .patches .iter() @@ -311,9 +317,8 @@ pub async fn stage_vendor_sources_in_memory( .files .values() .all(|f| !missing_blobs.contains(&f.after_hash)); - let diff_present = !missing_diff_archives.contains(&record.uuid); let pkg_present = !missing_package_archives.contains(&record.uuid); - if all_blobs_present || diff_present || pkg_present { + if all_blobs_present || pkg_present { None } else { Some((purl.as_str(), record.uuid.as_str())) diff --git a/crates/socket-patch-cli/src/commands/get.rs b/crates/socket-patch-cli/src/commands/get.rs index f9ec978..0ba397f 100644 --- a/crates/socket-patch-cli/src/commands/get.rs +++ b/crates/socket-patch-cli/src/commands/get.rs @@ -346,6 +346,25 @@ fn build_patch_record(patch: &PatchResponse, files: HashMap (String, PatchRecord) { + let mut files = HashMap::new(); + for (file_path, file_info) in &patch.files { + if let (Some(before), Some(after)) = (&file_info.before_hash, &file_info.after_hash) { + files.insert( + file_path.clone(), + PatchFileInfo { + before_hash: before.clone(), + after_hash: after.clone(), + }, + ); + } + } + (patch.purl.clone(), build_patch_record(patch, files)) +} + #[derive(Args)] pub struct GetArgs { /// Patch identifier (UUID, CVE ID, GHSA ID, PURL, or package name). 
diff --git a/crates/socket-patch-cli/src/commands/mod.rs b/crates/socket-patch-cli/src/commands/mod.rs index 1fa97c9..51b1829 100644 --- a/crates/socket-patch-cli/src/commands/mod.rs +++ b/crates/socket-patch-cli/src/commands/mod.rs @@ -5,6 +5,7 @@ pub mod list; pub mod lock_cli; pub mod remove; pub mod repair; +pub mod repair_vendor; pub mod rollback; pub mod scan; pub mod setup; diff --git a/crates/socket-patch-cli/src/commands/repair.rs b/crates/socket-patch-cli/src/commands/repair.rs index 58754b5..4050e8c 100644 --- a/crates/socket-patch-cli/src/commands/repair.rs +++ b/crates/socket-patch-cli/src/commands/repair.rs @@ -61,18 +61,37 @@ pub async fn run(args: RepairArgs) -> i32 { let manifest_path = args.common.resolved_manifest_path(); if tokio::fs::metadata(&manifest_path).await.is_err() { - if args.common.json { - let mut env = Envelope::new(Command::Repair); - env.dry_run = args.common.dry_run; - env.mark_error(EnvelopeError::new( - "manifest_not_found", - format!("Manifest not found at {}", manifest_path.display()), - )); - println!("{}", env.to_pretty_json()); - } else { - eprintln!("Manifest not found at {}", manifest_path.display()); + // No manifest is still repairable when the project carries vendored + // state: a committed ledger, or lockfiles rewired to + // `.socket/vendor/...` paths (the ledger itself may be the thing + // that needs repairing). Only a project with neither is an error. 
+ let state_file = args + .common + .cwd + .join(socket_patch_core::patch::vendor::VENDOR_STATE_REL); + let has_vendor_traces = tokio::fs::metadata(&state_file).await.is_ok() + || !crate::commands::repair_vendor::scan_vendor_references(&args.common.cwd) + .await + .is_empty(); + if !has_vendor_traces { + if args.common.json { + let mut env = Envelope::new(Command::Repair); + env.dry_run = args.common.dry_run; + env.mark_error(EnvelopeError::new( + "manifest_not_found", + format!("Manifest not found at {}", manifest_path.display()), + )); + println!("{}", env.to_pretty_json()); + } else { + eprintln!("Manifest not found at {}", manifest_path.display()); + } + return 1; + } + // The vendor-only repair still serializes on the .socket lock; the + // lock layer deliberately refuses to mkdir. + if let Some(dir) = manifest_path.parent() { + let _ = tokio::fs::create_dir_all(dir).await; } - return 1; } // Serialize against concurrent socket-patch runs targeting the @@ -165,10 +184,11 @@ pub(crate) async fn repair_inner( args: &RepairArgs, manifest_path: &Path, ) -> Result<(Envelope, RepairCounts), String> { + // `Ok(None)` = no manifest (vendor-only repair); present-but-invalid + // stays a hard error. let manifest = read_manifest(manifest_path) .await - .map_err(|e| e.to_string())? - .ok_or_else(|| "Invalid manifest".to_string())?; + .map_err(|e| e.to_string())?; let socket_dir = manifest_path.parent().unwrap(); let blobs_path = socket_dir.join("blobs"); @@ -192,19 +212,69 @@ pub(crate) async fn repair_inner( let mut blobs_checked = 0usize; let mut bytes_freed = 0u64; + // The envelope is built up-front: the vendored-artifact phase records + // its events inline; the download/cleanup aggregates are appended at + // the end (event ordering is documented best-effort). + let mut env = Envelope::new(Command::Repair); + env.dry_run = args.common.dry_run; + // Step 1: Check for and download missing artifacts in the requested // mode. 
Counts below refer to whatever kind of artifact was requested // (file blobs, diff archives, or package archives). - let missing_artifacts: Vec = match download_mode { - DownloadMode::File => get_missing_blobs(&manifest, &blobs_path) + // + // VENDORED-in-sync manifest entries are excluded: vendor flows keep + // patch content in memory and the committed artifact IS the patch, so + // a fully-vendored project legitimately has no `.socket/blobs|diffs| + // packages` — repair must not re-litter them (or fail trying). The + // cleanup phase below still uses the FULL manifest, so it never sweeps + // sources an in-place apply may need for rollback. + let vendor_state = socket_patch_core::patch::vendor::load_state(&args.common.cwd) + .await + .unwrap_or_default(); + // Lockfile vendor references count as vendored even before the ledger + // is reconstructed, so a no-ledger repair doesn't download sources for + // entries the vendored phase is about to own. + let referenced_uuids: std::collections::HashSet = + crate::commands::repair_vendor::scan_vendor_references(&args.common.cwd) + .await + .into_iter() + .map(|(_, uuid, _)| uuid) + .collect(); + let scoped_manifest = manifest.as_ref().map(|m| { + let patches = m + .patches + .iter() + .filter(|(purl, rec)| { + !referenced_uuids.contains(&rec.uuid) + && vendor_state + .entries + .get(*purl) + .or_else(|| { + vendor_state + .entries + .values() + .find(|e| &e.base_purl == *purl) + }) + .is_none_or(|e| e.uuid != rec.uuid) + }) + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + socket_patch_core::manifest::schema::PatchManifest { + patches, + setup: m.setup.clone(), + } + }); + let missing_artifacts: Vec = match (&scoped_manifest, download_mode) { + (None, _) => Vec::new(), + (Some(m), DownloadMode::File) => get_missing_blobs(m, &blobs_path) .await .into_iter() .collect(), - DownloadMode::Diff => get_missing_archives(&manifest, &diffs_path) + (Some(m), DownloadMode::Diff) => get_missing_archives(m, &diffs_path) 
.await .into_iter() .collect(), - DownloadMode::Package => get_missing_archives(&manifest, &packages_path) + (Some(m), DownloadMode::Package) => get_missing_archives(m, &packages_path) .await .into_iter() .collect(), @@ -243,8 +313,13 @@ pub(crate) async fn repair_inner( diffs_path: Some(&diffs_path), mem_blobs: None, }; + // Step 1 only runs with a manifest (missing_artifacts is + // empty otherwise), so the expect is unreachable. + let m = scoped_manifest + .as_ref() + .expect("step 1 requires a manifest"); let fetch_result = - fetch_missing_sources(&manifest, &sources, download_mode, &client, None).await; + fetch_missing_sources(m, &sources, download_mode, &client, None).await; downloaded_count = fetch_result.downloaded; download_failed_count = fetch_result.failed; if !quiet { @@ -278,8 +353,24 @@ pub(crate) async fn repair_inner( ); } + // Step 1.5: vendored artifacts — health-check the ledger (and any + // lockfile vendor references with no ledger coverage) and rebuild + // missing/corrupt artifacts. Runs under `--download-only` too: + // restoring artifacts IS repair's download half. + let vendor_counts = crate::commands::repair_vendor::repair_vendored_artifacts( + &args.common, + manifest.as_ref(), + socket_dir, + &mut env, + ) + .await; + if !quiet && vendor_counts.rebuilt > 0 { + println!("Rebuilt {} vendored artifact(s).", vendor_counts.rebuilt); + } + // Step 2: Clean up unused artifacts across all three directories. - if !args.download_only { + if let (false, Some(manifest)) = (args.download_only, manifest.as_ref()) { + let manifest = manifest.clone(); if !quiet { println!(); } @@ -361,8 +452,6 @@ pub(crate) async fn repair_inner( // Translate the aggregate counts into envelope events. `repair` // operates on artifacts (not specific patches), so events use the // `PatchEvent::artifact` form (no PURL/UUID). 
- let mut env = Envelope::new(Command::Repair); - env.dry_run = args.common.dry_run; let action_for_repair = if args.common.dry_run { PatchAction::Verified } else { diff --git a/crates/socket-patch-cli/src/commands/repair_vendor.rs b/crates/socket-patch-cli/src/commands/repair_vendor.rs new file mode 100644 index 0000000..ee9801c --- /dev/null +++ b/crates/socket-patch-cli/src/commands/repair_vendor.rs @@ -0,0 +1,853 @@ +//! `repair`'s vendored-artifact phase: rebuild committed vendor artifacts +//! that are referenced (ledger entry and/or rewired lockfile) but missing +//! or corrupt on disk. +//! +//! Detection is the core health check ([`check_vendored_artifact`]: per-file +//! afterHashes + the whole-file ledger sha256 for file-shaped artifacts). +//! Rebuilds re-dispatch the normal vendor backends — their wired hot paths +//! rebuild the ARTIFACT only and never touch lockfiles or re-record ledger +//! originals — fed by the same pristine-source ladder as `vendor` (installed +//! copy → lockfile-verified registry fetch → ledger-recovered pre-vendor +//! fragment), with patch content staged in memory. +//! +//! Lockfile references with NO ledger coverage (`.socket/vendor` deleted +//! wholesale, state.json included) are RECONSTRUCTED: the uuid is recovered +//! from the lockfile path itself (the contract's uuid-in-path rule), the +//! record from the manifest (or the patch API, yielding a detached entry), +//! and a fresh ledger entry is re-synthesized so sweep/GC/revert know the +//! artifact again. Reconstructed entries carry no pre-vendor wiring +//! originals — `--revert` degrades to its documented +//! `vendor_lock_entry_drifted` re-resolve guidance. 
+ +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +use socket_patch_core::api::client::get_api_client_with_overrides; +use socket_patch_core::crawlers::CrawlerOptions; +use socket_patch_core::manifest::schema::{PatchManifest, PatchRecord}; +use socket_patch_core::patch::copy_tree::remove_tree; +use socket_patch_core::patch::vendor::{ + self, check_vendored_artifact, file_sha256_hex, load_state, lock_inventory, parse_vendor_path, + registry_fetch, ArtifactHealth, VendorEntry, +}; +use socket_patch_core::utils::purl::strip_purl_qualifiers; +use socket_patch_core::vex::time::now_rfc3339; + +use crate::args::GlobalArgs; +use crate::commands::fetch_stage::{stage_vendor_sources_in_memory, MemStageOutcome}; +use crate::commands::vendor::{ + dispatch_vendor_one, ecosystem_in_scope, fetch_pristine_package, persist_vendor_entry, + record_warning, PristineFetch, +}; +use crate::ecosystem_dispatch::{find_packages_for_purls, partition_purls}; +use crate::json_envelope::{Envelope, PatchAction, PatchEvent}; + +/// Counts surfaced to `repair_inner` for telemetry/human output. +#[derive(Default)] +pub(crate) struct RepairVendorCounts { + pub rebuilt: usize, + pub failed: usize, + pub healthy: usize, +} + +/// One broken vendored unit queued for rebuild. +struct Candidate { + purl: String, + entry: VendorEntry, + record: PatchRecord, + detached: bool, + /// True when the ledger entry was re-synthesized from a lockfile + /// reference (it must be persisted after a successful rebuild). + reconstructed: bool, + reason: &'static str, +} + +/// Files the vendor backends rewire — the search space for +/// `.socket/vendor/{eco}/{uuid}/` references when the ledger is gone.
+const WIRING_FILES: &[&str] = &[ + "package-lock.json", + "npm-shrinkwrap.json", + "pnpm-lock.yaml", + "yarn.lock", + "bun.lock", + "package.json", + "Cargo.toml", + "Cargo.lock", + ".cargo/config.toml", + "go.mod", + "composer.json", + "composer.lock", + "Gemfile", + "Gemfile.lock", + "uv.lock", + "pyproject.toml", + "poetry.lock", + "pdm.lock", + "Pipfile.lock", + "requirements.txt", +]; + +/// Scan the wiring-bearing files for vendored-artifact references, +/// returning deduped `(ecosystem, uuid, artifact relpath)` triples. Pure +/// text scan + the canonical path parser — the same recovery rule the CLI +/// contract documents for external tools. +pub(crate) async fn scan_vendor_references(project_root: &Path) -> Vec<(String, String, String)> { + let mut seen: HashSet<(String, String)> = HashSet::new(); + let mut out = Vec::new(); + for file in WIRING_FILES { + let Ok(text) = tokio::fs::read_to_string(project_root.join(file)).await else { + continue; + }; + let mut rest = text.as_str(); + while let Some(idx) = rest.find(".socket") { + let slice = &rest[idx..]; + // `:` ends a reference too: pnpm snapshot keys are + // `name@file::` and yaml mappings suffix the path with a + // colon — npm names/versions never contain one. 
+ let end = slice + .find([ + '"', '\'', '`', ' ', '\t', '\n', '\r', ',', ')', ']', '}', ';', ':', + ]) + .unwrap_or(slice.len()); + let candidate = slice[..end].replace('\\', "/"); + if let Some(parts) = parse_vendor_path(&candidate) { + if seen.insert((parts.eco.to_string(), parts.uuid.clone())) { + out.push(( + parts.eco.to_string(), + parts.uuid.clone(), + candidate.trim_start_matches("./").to_string(), + )); + } + } + rest = &rest[idx + ".socket".len()..]; + } + } + out.sort(); + out +} + +fn synth_entry(eco: &str, uuid: &str, artifact_path: &str, base_purl: &str) -> VendorEntry { + VendorEntry { + ecosystem: eco.to_string(), + base_purl: base_purl.to_string(), + uuid: uuid.to_string(), + artifact: socket_patch_core::patch::vendor::state::VendorArtifact { + path: artifact_path.to_string(), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring: Vec::new(), + lock: None, + took_over_go_patches: false, + detached: false, + record: None, + flavor: None, + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } +} + +fn fail( + env: &mut Envelope, + counts: &mut RepairVendorCounts, + quiet: bool, + purl: &str, + code: &str, + detail: String, +) { + if !quiet { + eprintln!( + "Cannot repair vendored artifact for {}: {detail}", + socket_patch_core::utils::purl::normalize_purl(purl) + ); + } + env.record(PatchEvent::new(PatchAction::Failed, purl.to_string()).with_error(code, detail)); + env.mark_partial_failure(); + counts.failed += 1; +} + +/// The vendored-artifact phase of `repair`. Runs between the download and +/// cleanup phases (and under `--download-only` — restoring artifacts IS +/// repair's job). `manifest` is `None` when the project has no +/// `.socket/manifest.json` (detached/reconstruction-only repairs). 
+pub(crate) async fn repair_vendored_artifacts( + common: &GlobalArgs, + manifest: Option<&PatchManifest>, + socket_dir: &Path, + env: &mut Envelope, +) -> RepairVendorCounts { + let quiet = common.json || common.silent; + let mut counts = RepairVendorCounts::default(); + + let mut state = match load_state(&common.cwd).await { + Ok(s) => s, + Err(e) => { + env.record( + PatchEvent::artifact(PatchAction::Failed) + .with_error("vendor_state_unreadable", e.to_string()), + ); + env.mark_partial_failure(); + counts.failed += 1; + return counts; + } + }; + + // ── Pass 1: ledger-driven health check ─────────────────────────────── + let mut candidates: Vec = Vec::new(); + let mut ledger_purls: Vec = state.entries.keys().cloned().collect(); + ledger_purls.sort(); + for purl in &ledger_purls { + let entry = state.entries[purl].clone(); + if !ecosystem_in_scope(common, &entry.ecosystem) { + continue; + } + let record = match (&entry.record, manifest) { + (Some(r), _) => r.clone(), + (None, Some(m)) => { + match m + .patches + .get(purl) + .cloned() + .or_else(|| m.patches.values().find(|r| r.uuid == entry.uuid).cloned()) + { + Some(r) => r, + // Dropped from the manifest: the vendor reconcile owns + // reverting it — not repair's call. + None => continue, + } + } + // Non-detached entry with no manifest at all: recover the + // record from the API below, like a reconstruction. 
+ (None, None) => match fetch_record_by_uuid(common, &entry.uuid).await { + Some((_, r)) => r, + None => { + fail( + env, + &mut counts, + quiet, + purl, + "vendor_artifact_unrepairable", + format!( + "no manifest record for patch {} and the patch view could not \ + be fetched (offline or API failure)", + entry.uuid + ), + ); + continue; + } + }, + }; + if record.uuid != entry.uuid { + env.record( + PatchEvent::new(PatchAction::Skipped, purl.clone()).with_reason( + "vendor_uuid_mismatch", + "the manifest's patch uuid moved on; run `socket-patch vendor` (or \ + `scan --vendor`) to re-vendor", + ), + ); + continue; + } + match check_vendored_artifact(&common.cwd, &entry, &record).await { + ArtifactHealth::Healthy => counts.healthy += 1, + ArtifactHealth::StaleUuid => { + env.record( + PatchEvent::new(PatchAction::Skipped, purl.clone()).with_reason( + "vendor_uuid_mismatch", + "a re-vendor is pending for this package; run `socket-patch vendor`", + ), + ); + } + ArtifactHealth::Unverifiable { reason } => { + fail( + env, + &mut counts, + quiet, + purl, + "vendor_artifact_unrepairable", + format!("the ledger entry cannot be verified ({reason}); fix state.json"), + ); + } + ArtifactHealth::Missing => { + let detached = entry.detached; + candidates.push(Candidate { + purl: purl.clone(), + entry, + record, + detached, + reconstructed: false, + reason: "vendor_artifact_missing", + }); + } + ArtifactHealth::Corrupt { .. 
} => { + let detached = entry.detached; + candidates.push(Candidate { + purl: purl.clone(), + entry, + record, + detached, + reconstructed: false, + reason: "vendor_artifact_corrupt", + }); + } + } + } + + // ── Pass 2: lockfile references with no ledger coverage ───────────── + let covered: HashSet<(String, String)> = state + .entries + .values() + .map(|e| (e.ecosystem.clone(), e.uuid.clone())) + .collect(); + for (eco, uuid, relpath) in scan_vendor_references(&common.cwd).await { + if covered.contains(&(eco.clone(), uuid.clone())) || !ecosystem_in_scope(common, &eco) { + continue; + } + // The record: manifest by uuid first, else the patch API (the entry + // is then detached — exactly the manifest-less vendoring shape). + let (purl, record, detached) = + match manifest.and_then(|m| m.patches.iter().find(|(_, r)| r.uuid == uuid)) { + Some((p, r)) => (p.clone(), r.clone(), false), + None => match fetch_record_by_uuid(common, &uuid).await { + Some((purl, r)) => (purl, r, true), + None => { + fail( + env, + &mut counts, + quiet, + &format!("pkg:{eco}/unknown@{uuid}"), + "vendor_artifact_missing", + format!( + "the lockfile references .socket/vendor/{eco}/{uuid}/ but the \ + vendor ledger is gone and the patch view could not be fetched \ + (offline or API failure); restore .socket/vendor/state.json or \ + re-run online" + ), + ); + continue; + } + }, + }; + let mut entry = synth_entry(&eco, &uuid, &relpath, strip_purl_qualifiers(&purl)); + entry.detached = detached; + if detached { + entry.record = Some(record.clone()); + } + match check_vendored_artifact(&common.cwd, &entry, &record).await { + ArtifactHealth::Healthy => { + // The artifact survived; only the ledger was lost. Restore + // the entry (sha/size recomputed) so GC/sweep/revert know + // the artifact again — without it the next `scan --prune` + // would sweep the uuid dir as an orphan. 
+ if common.dry_run { + env.record( + PatchEvent::new(PatchAction::Verified, purl.clone()).with_details( + serde_json::json!({ + "vendorArtifact": true, + "wouldRestoreLedgerEntry": true, + "path": relpath, + }), + ), + ); + continue; + } + fill_artifact_fingerprint(&common.cwd, &mut entry).await; + let save_failed = + persist_vendor_entry(common, env, &mut state, &purl, entry, detached, &record) + .await; + if save_failed { + counts.failed += 1; + continue; + } + env.record( + PatchEvent::new(PatchAction::Rebuilt, purl.clone()).with_details( + serde_json::json!({ + "path": relpath, + "ledgerRestored": true, + "artifactRebuilt": false, + }), + ), + ); + counts.rebuilt += 1; + } + _ => { + candidates.push(Candidate { + purl, + entry, + record, + detached, + reconstructed: true, + reason: "vendor_artifact_missing", + }); + } + } + } + + if candidates.is_empty() { + return counts; + } + + // ── Dry run: preview only ──────────────────────────────────────────── + if common.dry_run { + for c in &candidates { + env.record( + PatchEvent::new(PatchAction::Verified, c.purl.clone()).with_details( + serde_json::json!({ + "vendorArtifact": true, + "wouldRebuild": true, + "reason": c.reason, + "path": c.entry.artifact.path, + }), + ), + ); + } + return counts; + } + + if !quiet { + println!( + "\nRebuilding {} broken vendored artifact(s)...", + candidates.len() + ); + } + + // ── Corrupt artifacts are deleted first ────────────────────────────── + // The backends' wired hot paths rebuild on MISSING; turning corrupt + // into missing gives every ecosystem one uniform rebuild trigger (and + // never leaves tampered bytes to be blended into a rebuild). 
+ for c in &candidates { + if c.reason == "vendor_artifact_corrupt" { + if let Some(rel) = vendor::path::vendor_uuid_dir_rel(&c.entry.ecosystem, &c.entry.uuid) + { + let _ = remove_tree(&common.cwd.join(rel)).await; + } + } + } + + // ── Patch content (in memory, like all vendor flows) ──────────────── + let records_map: HashMap = candidates + .iter() + .map(|c| (c.purl.clone(), c.record.clone())) + .collect(); + let synth = PatchManifest { + patches: records_map, + setup: None, + }; + let staged = match stage_vendor_sources_in_memory(common, &synth, socket_dir, &common.cwd).await + { + Ok(MemStageOutcome::Ready(s)) => s, + Ok(MemStageOutcome::Unavailable) => { + for c in &candidates { + fail( + env, + &mut counts, + quiet, + &c.purl, + c.reason, + format!( + "the vendored artifact at {} is broken and its patch content has \ + no local source ({})", + c.entry.artifact.path, + if common.offline { + "--offline prevents fetching it" + } else { + "download failed" + } + ), + ); + } + return counts; + } + Err(e) => { + env.record(PatchEvent::artifact(PatchAction::Failed).with_error("stage_failed", e)); + env.mark_partial_failure(); + counts.failed += candidates.len(); + return counts; + } + }; + let sources = staged.as_patch_sources(); + + // ── Pristine package sources ───────────────────────────────────────── + let purls: Vec = candidates.iter().map(|c| c.purl.clone()).collect(); + let partitioned = partition_purls(&purls, common.ecosystems.as_deref()); + let crawler_options = CrawlerOptions { + cwd: common.cwd.clone(), + global: common.global, + global_prefix: common.global_prefix.clone(), + batch_size: 100, + }; + let mut all_packages = find_packages_for_purls(&partitioned, &crawler_options, quiet).await; + let inventory = lock_inventory::inventory_project(&common.cwd).await; + let client = registry_fetch::build_registry_client(); + let mut holders: Vec = Vec::new(); + let mut unrebuildable: HashSet = HashSet::new(); + // Reconstructed npm candidates fetched 
UNVERIFIED from the conventional + // registry: their rebuilt tarball MUST match the integrity the rewired + // lockfile records (the trust anchor) before anything is persisted. + let mut must_verify: HashMap = HashMap::new(); + for c in &candidates { + if all_packages.contains_key(&c.purl) { + continue; // installed copy: works offline too + } + if common.offline { + fail( + env, + &mut counts, + quiet, + &c.purl, + c.reason, + format!( + "the vendored artifact at {} is broken, the package is not installed, \ + and --offline prevents fetching a pristine copy", + c.entry.artifact.path + ), + ); + unrebuildable.insert(c.purl.clone()); + continue; + } + match fetch_pristine_package(&common.cwd, &inventory, &client, &c.purl, Some(&c.entry)) + .await + { + PristineFetch::Fetched(fetched) => { + all_packages.insert(c.purl.clone(), fetched.dir().to_path_buf()); + holders.push(fetched); + } + PristineFetch::NoSource | PristineFetch::Unverifiable(_) => { + // Last rung (npm): the REWIRED lockfile still records the + // integrity of our packed tarball. Fetch the pristine copy + // unverified, rebuild deterministically, and verify the + // REBUILT artifact against that wired integrity below — + // end-to-end fail-closed without ledger or installed copy. 
+ if c.entry.ecosystem == "npm" { + if let Some(wired) = + lock_inventory::wired_vendor_integrity(&common.cwd, &c.entry.artifact.path) + .await + { + if let Some((name, version)) = npm_coords(&c.entry.base_purl) { + match registry_fetch::fetch_npm_unverified(&name, &version, &client) + .await + { + Ok(fetched) => { + all_packages + .insert(c.purl.clone(), fetched.dir().to_path_buf()); + holders.push(fetched); + must_verify.insert(c.purl.clone(), wired); + continue; + } + Err(registry_fetch::FetchError::Failed(d)) + | Err(registry_fetch::FetchError::Unverifiable(d)) => { + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_fetch_failed", + d, + ); + unrebuildable.insert(c.purl.clone()); + continue; + } + } + } + } + } + let detail = fetch_pristine_unrepairable_detail(c).unwrap_or_else(|| { + "no verifiable pristine source: the package is not installed, the \ + lockfile is rewired to the (broken) vendored artifact, and the \ + ledger records no recoverable registry fragment" + .to_string() + }); + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_unrepairable", + detail, + ); + unrebuildable.insert(c.purl.clone()); + } + PristineFetch::Failed(detail) => { + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_fetch_failed", + detail, + ); + unrebuildable.insert(c.purl.clone()); + } + } + } + + // ── Rebuild via the normal backends ────────────────────────────────── + let vendored_at = now_rfc3339(); + for c in candidates { + if unrebuildable.contains(&c.purl) { + continue; + } + let Some(pkg_path) = all_packages.get(&c.purl).cloned() else { + continue; // failed above + }; + let outcome = dispatch_vendor_one( + &c.purl, + &pkg_path, + &common.cwd, + &c.record, + &sources, + &vendored_at, + false, + false, + ) + .await; + match outcome { + None => { + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_unrepairable", + "no vendor backend for this ecosystem in this build".to_string(), + ); + } + 
Some(socket_patch_core::patch::vendor::VendorOutcome::Refused { code, detail }) => { + fail(env, &mut counts, quiet, &c.purl, code, detail); + } + Some(socket_patch_core::patch::vendor::VendorOutcome::Done { + result, + entry, + warnings, + }) => { + if !result.success { + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_rebuild_failed", + result.error.unwrap_or_else(|| "rebuild failed".to_string()), + ); + continue; + } + for w in &warnings { + // The Rebuilt event below carries the rebuild signal. + if w.code != "vendor_artifact_rebuilt" { + record_warning(env, &c.purl, w, common); + } + } + // Unverified pristine source: the rebuilt tarball must + // reproduce the integrity the rewired lockfile records. + if let Some(wired) = must_verify.get(&c.purl) { + let abs = common.cwd.join(&c.entry.artifact.path); + let verdict = match tokio::fs::read(&abs).await { + Ok(bytes) => { + let name = npm_coords(&c.entry.base_purl) + .map(|(n, _)| n) + .unwrap_or_default(); + registry_fetch::artifact_matches_integrity(&bytes, &name, wired) + } + Err(e) => Err(format!("cannot read the rebuilt artifact: {e}")), + }; + if let Err(detail) = verdict { + if let Some(rel) = + vendor::path::vendor_uuid_dir_rel(&c.entry.ecosystem, &c.entry.uuid) + { + let _ = remove_tree(&common.cwd.join(rel)).await; + } + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_rebuild_failed", + format!( + "the rebuilt artifact does not match the integrity the \ + lockfile records ({detail}); the pristine source may have \ + been tampered with — nothing was kept" + ), + ); + continue; + } + } + // The entry whose recorded fingerprint the post-check must + // match: a backend-returned entry (drift healed / wiring + // re-recorded) wins; a reconstructed entry gets its + // fingerprint computed from the rebuilt bytes. 
+ let mut check_entry = c.entry.clone(); + if let Some(e) = entry { + check_entry = e.clone(); + if persist_vendor_entry( + common, env, &mut state, &c.purl, e, c.detached, &c.record, + ) + .await + { + counts.failed += 1; + continue; + } + } else if c.reconstructed { + fill_artifact_fingerprint(&common.cwd, &mut check_entry).await; + if persist_vendor_entry( + common, + env, + &mut state, + &c.purl, + check_entry.clone(), + c.detached, + &c.record, + ) + .await + { + counts.failed += 1; + continue; + } + } + // ── Fail-closed post-verify ────────────────────────────── + match check_vendored_artifact(&common.cwd, &check_entry, &c.record).await { + ArtifactHealth::Healthy => { + if !quiet { + println!( + "Rebuilt {} ({})", + socket_patch_core::utils::purl::normalize_purl(&c.purl), + check_entry.artifact.path + ); + } + env.record( + PatchEvent::new(PatchAction::Rebuilt, c.purl.clone()).with_details( + serde_json::json!({ + "path": check_entry.artifact.path, + "reason": c.reason, + }), + ), + ); + counts.rebuilt += 1; + } + other => { + // The deterministic rebuild did not reproduce the + // recorded artifact (e.g. a tampered ledger sha): + // remove it rather than leave unverifiable bytes. + if let Some(rel) = vendor::path::vendor_uuid_dir_rel( + &check_entry.ecosystem, + &check_entry.uuid, + ) { + let _ = remove_tree(&common.cwd.join(rel)).await; + } + fail( + env, + &mut counts, + quiet, + &c.purl, + "vendor_artifact_rebuild_failed", + format!( + "the rebuilt artifact does not match the recorded \ + fingerprint ({other:?}); if state.json was edited, run \ + `socket-patch vendor` to re-vendor from scratch", + ), + ); + } + } + } + } + } + drop(holders); + counts +} + +/// Compute and record the artifact fingerprint (sha256 + size for +/// file-shaped artifacts) on a re-synthesized ledger entry. 
+async fn fill_artifact_fingerprint(project_root: &Path, entry: &mut VendorEntry) {
+    // Backslashes are rewritten to `/` so the suffix checks and the join
+    // below see a single spelling of the recorded path.
+    let norm = entry.artifact.path.replace('\\', "/");
+    let is_file_shaped = [".tgz", ".tar.gz", ".whl"]
+        .iter()
+        .any(|suffix| norm.ends_with(*suffix));
+    if !is_file_shaped {
+        return; // dir-shaped: integrity is per-file afterHashes
+    }
+    let abs = project_root.join(&norm);
+    // Both probes are best-effort: a field is only overwritten when its
+    // value could actually be read from disk.
+    if let Some(hex) = file_sha256_hex(&abs).await {
+        entry.artifact.sha256 = hex;
+    }
+    if let Ok(meta) = tokio::fs::metadata(&abs).await {
+        entry.artifact.size = Some(meta.len());
+    }
+}
+
+/// Fetch one patch view by uuid (proxy-aware) and shape it as a manifest
+/// record; `None` offline or on any API failure.
+async fn fetch_record_by_uuid(common: &GlobalArgs, uuid: &str) -> Option<(String, PatchRecord)> {
+    if common.offline {
+        return None;
+    }
+    let (client, _) = get_api_client_with_overrides(common.api_client_overrides()).await;
+    // `.ok()??`: an `Err` from the call and an inner `None` both collapse
+    // to `None` for the caller.
+    let patch = client
+        .fetch_patch(common.org.as_deref(), uuid)
+        .await
+        .ok()??;
+    Some(crate::commands::get::record_from_patch_response(&patch))
+}
+
+/// `pkg:npm/<name>@<version>` → (name, version); the name may be scoped.
+///
+/// Returns `None` when the purl (qualifiers stripped) lacks the `pkg:npm/`
+/// prefix, contains no `@`, or has an empty name or version.
+// NOTE(review): both components are returned verbatim — a percent-encoded
+// scope (`%40scope/name`) is presumably not decoded here; confirm that the
+// callers expect the encoded form.
+fn npm_coords(base_purl: &str) -> Option<(String, String)> {
+    let rest = strip_purl_qualifiers(base_purl).strip_prefix("pkg:npm/")?;
+    // Split on the LAST `@` so a scoped name keeps its leading `@`.
+    let (name, version) = rest.rsplit_once('@')?;
+    if name.is_empty() || version.is_empty() {
+        return None;
+    }
+    Some((name.to_string(), version.to_string()))
+}
+
+/// A more specific unrepairable detail when one is knowable from the entry.
+///
+/// Only a platform-locked artifact yields a message; every other entry
+/// returns `None` so the caller can fall back to its generic detail.
+fn fetch_pristine_unrepairable_detail(c: &Candidate) -> Option<String> {
+    (c.entry.artifact.platform_locked == Some(true)).then(|| {
+        "the vendored wheel is platform-locked (compiled); reinstall the package on \
+         this platform and re-run repair, or run `socket-patch vendor` to rebuild it"
+            .to_string()
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// pnpm writes vendored paths in THREE spellings — override values,
+    /// `tarball:` fields, and snapshot KEYS with a trailing colon.
The + /// scanner must yield the clean relpath whichever form it meets first. + #[tokio::test] + async fn scan_handles_pnpm_snapshot_key_colons() { + let tmp = tempfile::tempdir().unwrap(); + let uuid = "11111111-1111-4111-8111-111111111111"; + let lock = format!( + "overrides:\n left-pad@1.3.0: file:.socket/vendor/npm/{uuid}/left-pad-1.3.0.tgz\n\n\ + snapshots:\n\n left-pad@file:.socket/vendor/npm/{uuid}/left-pad-1.3.0.tgz:\n {{}}\n" + ); + tokio::fs::write(tmp.path().join("pnpm-lock.yaml"), &lock) + .await + .unwrap(); + let refs = scan_vendor_references(tmp.path()).await; + assert_eq!(refs.len(), 1, "{refs:?}"); + assert_eq!( + refs[0].2, + format!(".socket/vendor/npm/{uuid}/left-pad-1.3.0.tgz"), + "no trailing colon: {refs:?}" + ); + + // Snapshot-key-only lock (the key form is the FIRST occurrence). + let lock = format!( + "snapshots:\n\n left-pad@file:.socket/vendor/npm/{uuid}/left-pad-1.3.0.tgz:\n {{}}\n" + ); + tokio::fs::write(tmp.path().join("pnpm-lock.yaml"), &lock) + .await + .unwrap(); + let refs = scan_vendor_references(tmp.path()).await; + assert_eq!(refs.len(), 1, "{refs:?}"); + assert!( + refs[0].2.ends_with("left-pad-1.3.0.tgz"), + "trailing colon must be cut: {refs:?}" + ); + } +} diff --git a/crates/socket-patch-cli/src/commands/vendor.rs b/crates/socket-patch-cli/src/commands/vendor.rs index da33f3a..dae1815 100644 --- a/crates/socket-patch-cli/src/commands/vendor.rs +++ b/crates/socket-patch-cli/src/commands/vendor.rs @@ -82,7 +82,7 @@ pub struct VendorArgs { /// Refusal codes that are expected skips, not command failures: the user's /// request is still fully satisfied when these are the only non-successes. -fn refusal_is_benign(code: &str) -> bool { +pub(crate) fn refusal_is_benign(code: &str) -> bool { matches!(code, "vendor_unsupported_ecosystem" | "already_vendored") } @@ -90,7 +90,7 @@ fn refusal_is_benign(code: &str) -> bool { /// installed location (site-packages root for pypi, the package dir /// otherwise). 
Returns `None` for purls with no vendor backend in this build. #[allow(clippy::too_many_arguments)] -async fn dispatch_vendor_one( +pub(crate) async fn dispatch_vendor_one( purl: &str, pkg_path: &Path, project_root: &Path, @@ -294,7 +294,12 @@ pub(crate) fn ecosystem_in_scope(common: &GlobalArgs, eco: &str) -> bool { /// Surface a backend warning: stderr line for humans, a Skipped event with /// the stable code for JSON consumers (Skipped never flips the status). -fn record_warning(env: &mut Envelope, purl: &str, warning: &VendorWarning, common: &GlobalArgs) { +pub(crate) fn record_warning( + env: &mut Envelope, + purl: &str, + warning: &VendorWarning, + common: &GlobalArgs, +) { if !common.silent && !common.json { eprintln!("Warning ({}): {}", warning.code, warning.detail); } @@ -469,6 +474,140 @@ async fn run_vendor(args: &VendorArgs, manifest_path: &Path, env: &mut Envelope) /// /// Does NOT lock, read the manifest, or print the envelope — callers own all /// three. Returns whether any non-benign failure occurred. +/// Persist one backend-returned ledger entry: detached flagging, wiring +/// `original` carry-forward from the entry being replaced, per-package save +/// (crash-consistent with what is already wired), and the stale-uuid-dir +/// sweep on re-vendors. Returns `true` when the save failed (has_errors). 
+#[allow(clippy::too_many_arguments)] +pub(crate) async fn persist_vendor_entry( + common: &GlobalArgs, + env: &mut Envelope, + state: &mut socket_patch_core::patch::vendor::VendorState, + candidate: &str, + mut entry: socket_patch_core::patch::vendor::VendorEntry, + detached: bool, + record: &PatchRecord, +) -> bool { + let mut has_errors = false; + let candidate = candidate.to_string(); + entry.detached = detached; + entry.record = detached.then(|| record.clone()); + // A re-vendor run re-derives the entry from current + // disk state, where the takeover already happened — + // preserve the prior flag or the revert-time + // "takeover_not_restored" hint is lost. + let prev = state.entries.get(&candidate).cloned(); + if let Some(prev) = &prev { + entry.took_over_go_patches = entry.took_over_go_patches || prev.took_over_go_patches; + // A re-vendor (new patch uuid) rewrites our own + // stale wiring, so the backend records + // `original: None` (it must never record a + // dangling `.socket/vendor/` pointer as the + // pre-vendor fragment). The TRUE pre-vendor + // original lives in the entry being replaced — + // carry it forward by wiring identity, or a + // later `--revert` can only shrug + // (`vendor_lock_entry_drifted`) instead of + // restoring the registry fragment. + for rec in &mut entry.wiring { + if rec.action == socket_patch_core::patch::vendor::state::WiringAction::Rewritten + && rec.original.is_none() + { + if let Some(prev_rec) = prev + .wiring + .iter() + .find(|p| p.file == rec.file && p.kind == rec.kind && p.key == rec.key) + { + rec.original = prev_rec.original.clone(); + } + } + } + } + let new_uuid = entry.uuid.clone(); + state.entries.insert(candidate.clone(), entry); + // Persist per-package so a crash mid-run leaves a + // ledger that matches what's already wired. 
+ if let Err(e) = save_state(&common.cwd, state).await { + has_errors = true; + env.record( + PatchEvent::new(PatchAction::Failed, candidate.clone()) + .with_error("vendor_state_write_failed", e.to_string()), + ); + } else if let Some(prev) = prev.filter(|p| p.uuid != new_uuid) { + // Re-vendor under a newer patch uuid: the old + // uuid's dir is an orphan now — the wiring and + // ledger both point at the new uuid — unless + // another entry still shares it (the same + // `(eco, uuid)` ownership test as `--revert`'s + // orphan sweep). Only the live entry would + // otherwise reclaim it, and that never happens. + let still_referenced = state + .entries + .values() + .any(|e| e.ecosystem == prev.ecosystem && e.uuid == prev.uuid); + let stale_rel = vendor::path::vendor_uuid_dir_rel(&prev.ecosystem, &prev.uuid); + if let Some(rel) = stale_rel.filter(|_| !still_referenced) { + if !common.dry_run { + let _ = remove_tree(&common.cwd.join(rel)).await; + } + env.record( + PatchEvent::new(PatchAction::Removed, candidate.clone()).with_reason( + "vendor_stale_artifact_removed", + "previous patch uuid's vendored artifact removed", + ), + ); + } + } + has_errors +} + +/// One registry-fetch attempt through the pristine-source ladder's network +/// half: the lockfile inventory first, then the ledger-recovered pre-vendor +/// registry fragment (the live lockfile is rewired to `.socket/vendor/...` +/// for vendored packages, so only `--revert`'s restore data still knows the +/// registry resolution). Always integrity-verified fail-closed. +pub(crate) enum PristineFetch { + Fetched(socket_patch_core::patch::vendor::registry_fetch::FetchedPackage), + /// Neither the lockfile nor the ledger can name a verifiable source. 
+ NoSource, + Unverifiable(String), + Failed(String), +} + +pub(crate) async fn fetch_pristine_package( + project_root: &Path, + inventory: &[socket_patch_core::patch::vendor::lock_inventory::LockfileEntry], + client: &socket_patch_core::patch::vendor::registry_fetch::RegistryClient, + purl: &str, + ledger_entry: Option<&socket_patch_core::patch::vendor::VendorEntry>, +) -> PristineFetch { + use socket_patch_core::patch::vendor::{lock_inventory, registry_fetch}; + + let entry = match lock_inventory::lookup(inventory, purl) { + Some(e) => e.clone(), + None => { + let Some(le) = ledger_entry else { + return PristineFetch::NoSource; + }; + match lock_inventory::recover_lock_entry(project_root, le).await { + Ok(rec) => rec, + Err(e) => { + return PristineFetch::Unverifiable(format!( + "the lockfile no longer records a registry resolution for {purl} \ + (rewired to the vendored artifact) and the ledger cannot recover \ + one: {e}" + )) + } + } + } + }; + match registry_fetch::fetch_and_stage(&entry, client).await { + Ok(fetched) => PristineFetch::Fetched(fetched), + Err(registry_fetch::FetchError::Unverifiable(d)) => PristineFetch::Unverifiable(d), + Err(registry_fetch::FetchError::Failed(d)) => PristineFetch::Failed(d), + } +} + pub(crate) async fn vendor_records( common: &GlobalArgs, records: &HashMap, @@ -564,60 +703,86 @@ pub(crate) async fn vendor_records( // against the ledger — offline-safe, no registry traffic. 
let ledger = load_state(&common.cwd).await.unwrap_or_default(); for purl in &missing { - if let Some(entry) = ledger + let ledger_entry = ledger .entries .get(purl) - .or_else(|| ledger.entries.values().find(|e| &e.base_purl == purl)) + .or_else(|| ledger.entries.values().find(|e| &e.base_purl == purl)); + if let Some(entry) = ledger_entry .filter(|e| e.ecosystem == "npm" && e.artifact.path.ends_with(".tgz")) { let tgz = common.cwd.join(&entry.artifact.path); - match registry_fetch::stage_local_artifact(&tgz, &entry.artifact.sha256).await { - Ok(staged) => { - all_packages.insert(purl.clone(), staged.dir().to_path_buf()); - fetched_holders.push(staged); - continue; - } - Err(registry_fetch::FetchError::Failed(detail)) => { - // A corrupt committed artifact is worth a loud - // failure — re-vendoring over it would mask the - // corruption. - fetch_failed.insert(purl.clone()); - env.record( - PatchEvent::new(PatchAction::Failed, purl.clone()) - .with_error("vendor_fetch_failed", detail.clone()), - ); - if !common.silent && !common.json { - eprintln!("Cannot vendor {}: {detail}", normalize_purl(purl)); + if tokio::fs::metadata(&tgz).await.is_err() { + // The committed artifact is GONE (gitignored or + // deleted): not corruption — fall through to the + // registry ladder, which recovers the pre-vendor + // resolution from the ledger and rebuilds. 
+ record_warning( + env, + purl, + &VendorWarning::new( + "vendor_artifact_missing", + format!( + "the committed vendored artifact {} is missing; \ + recovering the registry resolution to rebuild it", + entry.artifact.path + ), + ), + common, + ); + } else { + match registry_fetch::stage_local_artifact(&tgz, &entry.artifact.sha256) + .await + { + Ok(staged) => { + all_packages.insert(purl.clone(), staged.dir().to_path_buf()); + fetched_holders.push(staged); + continue; + } + Err(registry_fetch::FetchError::Failed(detail)) => { + // A PRESENT-but-corrupt committed artifact is + // worth a loud failure — silently re-vendoring + // over it would mask the corruption. + fetch_failed.insert(purl.clone()); + let detail = format!( + "{detail}; run `socket-patch repair` to rebuild the \ + vendored artifact" + ); + env.record( + PatchEvent::new(PatchAction::Failed, purl.clone()) + .with_error("vendor_fetch_failed", detail.clone()), + ); + if !common.silent && !common.json { + eprintln!("Cannot vendor {}: {detail}", normalize_purl(purl)); + } + continue; + } + Err(registry_fetch::FetchError::Unverifiable(_)) => { + // No recorded hash (legacy ledger) — fall + // through to the lockfile/registry path. } - continue; - } - Err(registry_fetch::FetchError::Unverifiable(_)) => { - // No recorded hash (legacy ledger) — fall - // through to the lockfile/registry path. } } } - let Some(entry) = lock_inventory::lookup(&inventory, purl) else { - continue; // not lockfile-resolvable → package_not_installed - }; if common.offline { // The enriched skip detail lands below in the unmatched // pass (the purl stays unmatched). 
continue; } - match registry_fetch::fetch_and_stage(entry, &client).await { - Ok(fetched) => { + match fetch_pristine_package(&common.cwd, &inventory, &client, purl, ledger_entry) + .await + { + PristineFetch::Fetched(fetched) => { record_warning( env, purl, &VendorWarning::new( "vendor_fetched_missing", format!( - "{}@{} is not installed; fetched the pristine artifact \ - from {} (integrity verified against the lockfile) and \ - vendored from that copy — the project tree was not \ - touched", - entry.name, entry.version, fetched.url + "{} is not installed; fetched the pristine artifact \ + from {} (integrity verified) and vendored from that \ + copy — the project tree was not touched", + normalize_purl(purl), + fetched.url ), ), common, @@ -625,7 +790,11 @@ pub(crate) async fn vendor_records( all_packages.insert(purl.clone(), fetched.dir().to_path_buf()); fetched_holders.push(fetched); } - Err(registry_fetch::FetchError::Unverifiable(detail)) => { + PristineFetch::NoSource => { + // Plain not-installed package → the calm + // package_not_installed skip below. + } + PristineFetch::Unverifiable(detail) => { record_warning( env, purl, @@ -634,7 +803,7 @@ pub(crate) async fn vendor_records( ); // Falls through to package_not_installed below. } - Err(registry_fetch::FetchError::Failed(detail)) => { + PristineFetch::Failed(detail) => { fetch_failed.insert(purl.clone()); env.record( PatchEvent::new(PatchAction::Failed, purl.clone()) @@ -803,79 +972,11 @@ pub(crate) async fn vendor_records( for w in &warnings { record_warning(env, candidate, w, common); } - if let Some(mut entry) = entry { - entry.detached = detached; - entry.record = detached.then(|| record.clone()); - // A re-vendor run re-derives the entry from current - // disk state, where the takeover already happened — - // preserve the prior flag or the revert-time - // "takeover_not_restored" hint is lost. 
- let prev = state.entries.get(candidate).cloned(); - if let Some(prev) = &prev { - entry.took_over_go_patches = - entry.took_over_go_patches || prev.took_over_go_patches; - // A re-vendor (new patch uuid) rewrites our own - // stale wiring, so the backend records - // `original: None` (it must never record a - // dangling `.socket/vendor/` pointer as the - // pre-vendor fragment). The TRUE pre-vendor - // original lives in the entry being replaced — - // carry it forward by wiring identity, or a - // later `--revert` can only shrug - // (`vendor_lock_entry_drifted`) instead of - // restoring the registry fragment. - for rec in &mut entry.wiring { - if rec.action - == socket_patch_core::patch::vendor::state::WiringAction::Rewritten - && rec.original.is_none() - { - if let Some(prev_rec) = prev.wiring.iter().find(|p| { - p.file == rec.file - && p.kind == rec.kind - && p.key == rec.key - }) { - rec.original = prev_rec.original.clone(); - } - } - } - } - let new_uuid = entry.uuid.clone(); - state.entries.insert(candidate.clone(), entry); - // Persist per-package so a crash mid-run leaves a - // ledger that matches what's already wired. - if let Err(e) = save_state(&common.cwd, &state).await { - has_errors = true; - env.record( - PatchEvent::new(PatchAction::Failed, candidate.clone()) - .with_error("vendor_state_write_failed", e.to_string()), - ); - } else if let Some(prev) = prev.filter(|p| p.uuid != new_uuid) { - // Re-vendor under a newer patch uuid: the old - // uuid's dir is an orphan now — the wiring and - // ledger both point at the new uuid — unless - // another entry still shares it (the same - // `(eco, uuid)` ownership test as `--revert`'s - // orphan sweep). Only the live entry would - // otherwise reclaim it, and that never happens. 
- let still_referenced = state - .entries - .values() - .any(|e| e.ecosystem == prev.ecosystem && e.uuid == prev.uuid); - let stale_rel = - vendor::path::vendor_uuid_dir_rel(&prev.ecosystem, &prev.uuid); - if let Some(rel) = stale_rel.filter(|_| !still_referenced) { - if !common.dry_run { - let _ = remove_tree(&common.cwd.join(rel)).await; - } - env.record( - PatchEvent::new(PatchAction::Removed, candidate.clone()) - .with_reason( - "vendor_stale_artifact_removed", - "previous patch uuid's vendored artifact removed", - ), - ); - } - } + if let Some(entry) = entry { + has_errors |= persist_vendor_entry( + common, env, &mut state, candidate, entry, detached, record, + ) + .await; } } } diff --git a/crates/socket-patch-cli/src/json_envelope.rs b/crates/socket-patch-cli/src/json_envelope.rs index 9d88fde..13eb342 100644 --- a/crates/socket-patch-cli/src/json_envelope.rs +++ b/crates/socket-patch-cli/src/json_envelope.rs @@ -328,6 +328,10 @@ pub enum PatchAction { /// `apply --dry-run` / `scan --dry-run`: patch *would* apply /// cleanly. `files` lists what would change. Verified, + /// `repair`: a missing/corrupt vendored artifact was rebuilt in place + /// from verified sources (lockfiles and the vendor ledger untouched + /// unless drift was healed). + Rebuilt, } /// Patch-source strategy used to apply a file. Mirrors the existing @@ -402,6 +406,14 @@ pub struct Summary { pub failed: u32, pub removed: u32, pub verified: u32, + /// `repair`-only (vendored artifact rebuilds); omitted while zero so + /// every other command's summary shape is unchanged. 
+ #[serde(skip_serializing_if = "u32_is_zero")] + pub rebuilt: u32, +} + +fn u32_is_zero(n: &u32) -> bool { + *n == 0 } impl Summary { @@ -415,6 +427,7 @@ impl Summary { PatchAction::Failed => self.failed += 1, PatchAction::Removed => self.removed += 1, PatchAction::Verified => self.verified += 1, + PatchAction::Rebuilt => self.rebuilt += 1, } } } diff --git a/crates/socket-patch-cli/tests/repair_vendor_e2e.rs b/crates/socket-patch-cli/tests/repair_vendor_e2e.rs new file mode 100644 index 0000000..a6003fe --- /dev/null +++ b/crates/socket-patch-cli/tests/repair_vendor_e2e.rs @@ -0,0 +1,755 @@ +//! End-to-end tests for `repair`'s vendored-artifact phase: artifacts +//! referenced by the ledger and/or rewired lockfiles but missing/corrupt on +//! disk are rebuilt fail-closed (and the ledger itself is reconstructed from +//! lockfile references when it was deleted wholesale). Mock API + real npm +//! lockfile fixtures, driven through the built binary. + +use std::path::{Path, PathBuf}; +use std::process::Command; + +use sha2::{Digest, Sha256}; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +fn binary() -> PathBuf { + env!("CARGO_BIN_EXE_socket-patch").into() +} + +const ORG_SLUG: &str = "test-org"; +const UUID: &str = "11111111-1111-4111-8111-111111111111"; +const PURL: &str = "pkg:npm/left-pad@1.3.0"; +const ENCODED: &str = "pkg%3Anpm%2Fleft-pad%401.3.0"; +const BEFORE: &[u8] = b"before\n"; +const AFTER: &[u8] = b"after\n"; +const AFTER_B64: &str = "YWZ0ZXIK"; + +fn git_sha256(content: &[u8]) -> String { + let header = format!("blob {}\0", content.len()); + let mut hasher = Sha256::new(); + hasher.update(header.as_bytes()); + hasher.update(content); + hex::encode(hasher.finalize()) +} + +fn sha256_hex(bytes: &[u8]) -> String { + hex::encode(Sha256::digest(bytes)) +} + +fn sri_of(bytes: &[u8]) -> String { + use base64::Engine as _; + use sha2::Sha512; + format!( + "sha512-{}", + 
base64::engine::general_purpose::STANDARD.encode(Sha512::digest(bytes))
+    )
+}
+
+/// A pristine registry tarball for left-pad@1.3.0 (BEFORE bytes).
+///
+/// Returns the gzip-compressed tar bytes holding `package/package.json` and
+/// `package/index.js`.
+fn pristine_tgz() -> Vec<u8> {
+    let mut builder = tar::Builder::new(flate2::write::GzEncoder::new(
+        Vec::new(),
+        flate2::Compression::default(),
+    ));
+    for (path, bytes) in [
+        (
+            "package/package.json",
+            br#"{"name":"left-pad","version":"1.3.0"}"#.as_slice(),
+        ),
+        ("package/index.js", BEFORE),
+    ] {
+        let mut header = tar::Header::new_gnu();
+        header.set_size(bytes.len() as u64);
+        header.set_mode(0o644);
+        header.set_cksum();
+        builder.append_data(&mut header, path, bytes).unwrap();
+    }
+    // Finish the tar stream first, then the gzip stream wrapped around it.
+    builder.into_inner().unwrap().finish().unwrap()
+}
+
+/// Vendorable npm project: package.json, a v3 lock whose left-pad entry
+/// resolves to `resolved_url`/`integrity`, and the installed package.
+fn write_fixture(root: &Path, resolved_url: &str, integrity: &str) {
+    std::fs::write(
+        root.join("package.json"),
+        r#"{ "name": "repair-vendor-test", "version": "0.0.0" }"#,
+    )
+    .unwrap();
+    let lock = serde_json::json!({
+        "name": "repair-vendor-test",
+        "version": "0.0.0",
+        "lockfileVersion": 3,
+        "requires": true,
+        "packages": {
+            "": {
+                "name": "repair-vendor-test",
+                "version": "0.0.0",
+                "dependencies": { "left-pad": "^1.3.0" }
+            },
+            "node_modules/left-pad": {
+                "version": "1.3.0",
+                "resolved": resolved_url,
+                "integrity": integrity,
+                "license": "WTFPL"
+            }
+        }
+    });
+    // The lockfile is written pretty-printed with one trailing newline.
+    let mut lock_bytes = serde_json::to_vec_pretty(&lock).unwrap();
+    lock_bytes.push(b'\n');
+    std::fs::write(root.join("package-lock.json"), lock_bytes).unwrap();
+
+    // Installed copy under node_modules, carrying the BEFORE content.
+    let pkg = root.join("node_modules/left-pad");
+    std::fs::create_dir_all(&pkg).unwrap();
+    std::fs::write(
+        pkg.join("package.json"),
+        br#"{"name":"left-pad","version":"1.3.0"}"#,
+    )
+    .unwrap();
+    std::fs::write(pkg.join("index.js"), BEFORE).unwrap();
+}
+
+/// Mount discovery + view for `UUID` (same shapes as scan_vendor_e2e).
+async fn mount_patch_api(mock: &MockServer) {
+    let batch_route = format!("/v0/orgs/{ORG_SLUG}/patches/batch");
+    let by_package_route = format!("/v0/orgs/{ORG_SLUG}/patches/by-package/{ENCODED}");
+    let view_route = format!("/v0/orgs/{ORG_SLUG}/patches/view/{UUID}");
+
+    // POST batch lookup: one package carrying one free-tier patch.
+    let batch_body = serde_json::json!({
+        "packages": [{
+            "purl": PURL,
+            "patches": [{
+                "uuid": UUID,
+                "purl": PURL,
+                "tier": "free",
+                "cveIds": ["CVE-2026-0001"],
+                "ghsaIds": [],
+                "severity": "high",
+                "title": "vendor target"
+            }]
+        }],
+        "canAccessPaidPatches": false,
+    });
+    Mock::given(method("POST"))
+        .and(path(batch_route))
+        .respond_with(ResponseTemplate::new(200).set_body_json(batch_body))
+        .mount(mock)
+        .await;
+
+    // GET by-package listing for the percent-encoded purl.
+    let by_package_body = serde_json::json!({
+        "patches": [{
+            "uuid": UUID,
+            "purl": PURL,
+            "publishedAt": "2026-01-01T00:00:00Z",
+            "description": "Vendor patch",
+            "license": "MIT",
+            "tier": "free",
+            "vulnerabilities": {}
+        }],
+        "canAccessPaidPatches": false,
+    });
+    Mock::given(method("GET"))
+        .and(path(by_package_route))
+        .respond_with(ResponseTemplate::new(200).set_body_json(by_package_body))
+        .mount(mock)
+        .await;
+
+    // GET view: the full patch, with the before/after hashes and the
+    // base64 after-content for the single patched file.
+    let before_hash = git_sha256(BEFORE);
+    let after_hash = git_sha256(AFTER);
+    let view_body = serde_json::json!({
+        "uuid": UUID,
+        "purl": PURL,
+        "publishedAt": "2026-01-01T00:00:00Z",
+        "files": {
+            "package/index.js": {
+                "beforeHash": before_hash,
+                "afterHash": after_hash,
+                "blobContent": AFTER_B64,
+            }
+        },
+        "vulnerabilities": {
+            "GHSA-aaaa-bbbb-cccc": {
+                "cves": ["CVE-2026-0001"],
+                "summary": "test vuln",
+                "severity": "high",
+                "description": "details"
+            }
+        },
+        "description": "Vendor patch",
+        "license": "MIT",
+        "tier": "free",
+    });
+    Mock::given(method("GET"))
+        .and(path(view_route))
+        .respond_with(ResponseTemplate::new(200).set_body_json(view_body))
+        .mount(mock)
+        .await;
+}
+
+/// Serve the after-blob for `--download-mode file` repairs (test 7's step 1
+/// runs before the ledger is reconstructed, so its vendored entry is not
+/// yet excluded from the download phase).
+async fn mount_blob(mock: &MockServer) {
+    // The blob route is keyed by the git-style sha256 of the AFTER bytes.
+    Mock::given(method("GET"))
+        .and(path(format!(
+            "/v0/orgs/{ORG_SLUG}/patches/blob/{}",
+            git_sha256(AFTER)
+        )))
+        .respond_with(ResponseTemplate::new(200).set_body_bytes(AFTER))
+        .mount(mock)
+        .await;
+}
+
+/// Run the built binary in `root` with `argv` followed by the shared flags
+/// (`--json`, the mock API url, a fake token, the test org), telemetry
+/// disabled through the environment.
+///
+/// Returns `(exit code, stdout, stderr)`; the code is `-1` when the process
+/// reports none, and both streams are decoded lossily.
+fn run_cli(root: &Path, mock_uri: &str, argv: &[&str]) -> (i32, String, String) {
+    let mut full = argv.to_vec();
+    full.extend_from_slice(&[
+        "--json",
+        "--api-url",
+        mock_uri,
+        "--api-token",
+        "fake-token",
+        "--org",
+        ORG_SLUG,
+    ]);
+    let out = Command::new(binary())
+        .args(&full)
+        .current_dir(root)
+        .env("SOCKET_TELEMETRY_DISABLED", "1")
+        .output()
+        .expect("run");
+    (
+        out.status.code().unwrap_or(-1),
+        String::from_utf8_lossy(&out.stdout).into_owned(),
+        String::from_utf8_lossy(&out.stderr).into_owned(),
+    )
+}
+
+/// `scan --vendor --yes` to establish a vendored project; returns the
+/// vendored tarball path.
+///
+/// Panics when the command exits non-zero or the tarball is absent.
+fn vendor_project(root: &Path, mock_uri: &str, extra: &[&str]) -> PathBuf {
+    let mut argv = vec!["scan", "--vendor", "--yes"];
+    argv.extend_from_slice(extra);
+    let (code, stdout, stderr) = run_cli(root, mock_uri, &argv);
+    assert_eq!(code, 0, "vendor setup failed: {stdout} {stderr}");
+    let tgz = root.join(format!(".socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz"));
+    assert!(tgz.is_file(), "setup must vendor the tarball");
+    tgz
+}
+
+/// Parse the (trimmed) stdout of a `--json` run; panics with the raw text
+/// when it is not valid JSON.
+fn parse_env(stdout: &str) -> serde_json::Value {
+    serde_json::from_str(stdout.trim()).unwrap_or_else(|e| panic!("bad JSON ({e}): {stdout}"))
+}
+
+/// The envelope's `events` array, or an empty list when the key is absent
+/// or not an array.
+fn events_of(v: &serde_json::Value) -> Vec<serde_json::Value> {
+    v["events"].as_array().cloned().unwrap_or_default()
+}
+
+/// 1. Deleted tarball → `repair` rebuilds it byte-identically (installed
+/// copy + view-fetched patch content), lockfile and ledger untouched.
+#[tokio::test]
+async fn repair_rebuilds_deleted_vendored_tarball() {
+    // Arrange: a vendored project, with snapshots of everything that must
+    // survive the repair unchanged.
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let api = mock.uri();
+    let tmp = tempfile::tempdir().unwrap();
+    let root = tmp.path();
+    write_fixture(
+        root,
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let lock_path = root.join("package-lock.json");
+    let state_path = root.join(".socket/vendor/state.json");
+    let tgz = vendor_project(root, &api, &[]);
+    let vendored_bytes = std::fs::read(&tgz).unwrap();
+    let lock_before = std::fs::read(&lock_path).unwrap();
+    let ledger_before = std::fs::read(&state_path).unwrap();
+
+    // Act: lose the artifact, then repair.
+    std::fs::remove_file(&tgz).unwrap();
+    let (code, stdout, stderr) = run_cli(root, &api, &["repair"]);
+
+    // Assert: one rebuild reported, bytes reproduced, nothing else touched.
+    assert_eq!(code, 0, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}");
+    let saw_rebuilt_event = events_of(&v)
+        .iter()
+        .any(|e| e["action"] == "rebuilt" && e["purl"] == PURL);
+    assert!(saw_rebuilt_event, "envelope={v}");
+    assert_eq!(
+        std::fs::read(&tgz).unwrap(),
+        vendored_bytes,
+        "deterministic rebuild must reproduce the recorded bytes"
+    );
+    assert_eq!(
+        std::fs::read(&lock_path).unwrap(),
+        lock_before,
+        "lockfile untouched"
+    );
+    assert_eq!(
+        std::fs::read(&state_path).unwrap(),
+        ledger_before,
+        "ledger untouched"
+    );
+
+    // Healthy re-run: nothing to rebuild.
+    let (code, stdout, _) = run_cli(root, &api, &["repair"]);
+    assert_eq!(code, 0);
+    let v = parse_env(&stdout);
+    assert!(
+        v["summary"]["rebuilt"].is_null() || v["summary"]["rebuilt"] == 0,
+        "healthy ledger rebuilds nothing: {v}"
+    );
+}
+
+/// 2. `repair --offline` rebuilds from purely local sources (installed copy
+/// + seeded blob) with zero network.
+#[tokio::test]
+async fn repair_offline_rebuilds_from_local_sources() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+    std::fs::remove_file(&tgz).unwrap();
+
+    // Patch content available locally: the after-blob on disk.
+    let blobs = tmp.path().join(".socket/blobs");
+    std::fs::create_dir_all(&blobs).unwrap();
+    std::fs::write(blobs.join(git_sha256(AFTER)), AFTER).unwrap();
+
+    // Count the mock's received requests before and after the run: the
+    // offline repair must not add any.
+    let before_reqs = mock.received_requests().await.unwrap().len();
+    let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair", "--offline"]);
+    assert_eq!(code, 0, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}");
+    assert!(tgz.is_file(), "tarball rebuilt offline");
+    let after_reqs = mock.received_requests().await.unwrap().len();
+    assert_eq!(
+        before_reqs, after_reqs,
+        "--offline must make no network requests"
+    );
+}
+
+/// 3. Truncated/corrupt tarball → detected (whole-file sha vs ledger) and
+///    rebuilt.
+#[tokio::test]
+async fn repair_rebuilds_corrupt_vendored_tarball() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+    let tgz_bytes = std::fs::read(&tgz).unwrap();
+
+    // Overwrite the artifact with the two gzip magic bytes followed by junk,
+    // so the file still exists but no longer holds the recorded bytes.
+    std::fs::write(&tgz, b"\x1f\x8bgarbage").unwrap();
+
+    let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]);
+    assert_eq!(code, 0, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}");
+    assert_eq!(
+        std::fs::read(&tgz).unwrap(),
+        tgz_bytes,
+        "rebuild restores the recorded bytes"
+    );
+}
+
+/// 4. A tampered ledger sha can never be satisfied: the rebuild is removed
+///    and the run fails loudly rather than leaving unverifiable bytes.
+#[tokio::test]
+async fn repair_fails_closed_on_tampered_ledger_sha() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+
+    // Rewrite the ledger so the recorded artifact hash is 64 zeros — a value
+    // the rebuilt tarball cannot match.
+    let state_path = tmp.path().join(".socket/vendor/state.json");
+    let state = std::fs::read_to_string(&state_path).unwrap();
+    let mut v: serde_json::Value = serde_json::from_str(&state).unwrap();
+    v["entries"][PURL]["artifact"]["sha256"] = serde_json::json!("0".repeat(64));
+    std::fs::write(&state_path, serde_json::to_vec_pretty(&v).unwrap()).unwrap();
+
+    let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]);
+    assert_eq!(code, 1, "stdout={stdout} stderr={stderr}");
+    let env = parse_env(&stdout);
+    assert!(
+        events_of(&env)
+            .iter()
+            .any(|e| e["action"] == "failed" && e["errorCode"] == "vendor_artifact_rebuild_failed"),
+        "envelope={env}"
+    );
+    assert!(
+        !tgz.exists(),
+        "an unverifiable rebuild must not be left on disk"
+    );
+}
+
+/// 5. Fresh-clone `vendor` re-run with the committed artifact AND
+///    node_modules gone: the ledger's wiring original recovers the registry
+///    resolution, the pristine tarball is fetched + verified, and the
+///    artifact is rebuilt — exit 0 (previously a hard vendor_fetch_failed).
+#[tokio::test]
+async fn vendor_rerun_recovers_registry_resolution_from_ledger() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    // The mock doubles as the npm registry: it serves the pristine tarball
+    // at the path the fixture's lockfile resolves to.
+    let tgz_bytes = pristine_tgz();
+    let integrity = sri_of(&tgz_bytes);
+    Mock::given(method("GET"))
+        .and(path("/left-pad/-/left-pad-1.3.0.tgz"))
+        .respond_with(ResponseTemplate::new(200).set_body_bytes(tgz_bytes))
+        .mount(&mock)
+        .await;
+    let tmp = tempfile::tempdir().unwrap();
+    // The PRE-VENDOR lock resolves to the mock registry with the real
+    // integrity — that's what the ledger preserves as the wiring original.
+    write_fixture(
+        tmp.path(),
+        &format!("{}/left-pad/-/left-pad-1.3.0.tgz", mock.uri()),
+        &integrity,
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+    let lock1 = std::fs::read(tmp.path().join("package-lock.json")).unwrap();
+
+    // Simulate the fresh clone: neither the artifact nor node_modules exists.
+    std::fs::remove_file(&tgz).unwrap();
+    std::fs::remove_dir_all(tmp.path().join("node_modules")).unwrap();
+
+    let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["vendor"]);
+    assert_eq!(code, 0, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    assert!(
+        events_of(&v)
+            .iter()
+            .any(|e| e["errorCode"] == "vendor_artifact_missing"),
+        "the missing artifact is surfaced as a warning skip: {v}"
+    );
+    assert!(tgz.is_file(), "artifact rebuilt from the recovered fetch");
+    assert_eq!(
+        std::fs::read(tmp.path().join("package-lock.json")).unwrap(),
+        lock1,
+        "lockfile byte-stable"
+    );
+}
+
+/// 6. Detached vendoring (no manifest ever): repair rebuilds via the
+///    ledger-embedded record.
+#[tokio::test]
+async fn repair_rebuilds_detached_entry_without_manifest() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &["--detached"]);
+    // Precondition for the scenario: the detached setup left no manifest.
+    assert!(
+        !tmp.path().join(".socket/manifest.json").exists(),
+        "detached mode writes no manifest"
+    );
+    std::fs::remove_file(&tgz).unwrap();
+
+    let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]);
+    assert_eq!(code, 0, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}");
+    assert!(tgz.is_file());
+}
+
+/// 7. The whole `.socket/vendor` tree (state.json included) deleted while
+///    the manifest survives: repair reconstructs the ledger entry from the
+///    lockfile's vendor-path reference and rebuilds the artifact.
+#[tokio::test]
+async fn repair_reconstructs_ledger_from_lockfile_references() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+    // Snapshot the rewired lockfile; repair must leave it byte-identical.
+    let lock1 = std::fs::read(tmp.path().join("package-lock.json")).unwrap();
+
+    std::fs::remove_dir_all(tmp.path().join(".socket/vendor")).unwrap();
+
+    // With the ledger gone, step 1 sees the manifest entry as un-vendored
+    // and downloads its source; serve the blob and use file mode.
+    mount_blob(&mock).await;
+    let (code, stdout, stderr) = run_cli(
+        tmp.path(),
+        &mock.uri(),
+        &["repair", "--download-mode", "file"],
+    );
+    assert_eq!(code, 0, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}");
+    assert!(tgz.is_file(), "artifact rebuilt");
+    assert_eq!(
+        std::fs::read(tmp.path().join("package-lock.json")).unwrap(),
+        lock1,
+        "lockfile untouched"
+    );
+
+    // The re-synthesized ledger entry: same uuid, fingerprint of the
+    // rebuilt bytes, NOT detached (the manifest still has the record).
+    let state: serde_json::Value = serde_json::from_str(
+        &std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")).unwrap(),
+    )
+    .unwrap();
+    let entry = &state["entries"][PURL];
+    assert_eq!(entry["uuid"], UUID, "state={state}");
+    assert!(entry["detached"].is_null(), "state={state}");
+    assert_eq!(
+        entry["artifact"]["sha256"],
+        sha256_hex(&std::fs::read(&tgz).unwrap()),
+        "recomputed fingerprint matches the rebuilt artifact: {state}"
+    );
+
+    // Revert degrades gracefully (no recorded originals): exit 0, artifact
+    // removed, the drifted-entry guidance surfaced.
+    let (code, stdout, _) = run_cli(tmp.path(), &mock.uri(), &["vendor", "--revert"]);
+    assert_eq!(code, 0, "revert of a reconstructed entry: {stdout}");
+    assert!(!tgz.exists(), "revert removed the artifact");
+}
+
+/// 8. No ledger AND no manifest — only the rewired lockfile: the uuid in
+///    the lock path drives an API view fetch and the entry is re-created
+///    DETACHED (manifest-invisible), with the artifact rebuilt.
+#[tokio::test]
+async fn repair_reconstructs_detached_from_lockfile_only() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+
+    // Remove the whole `.socket` directory (ledger, manifest and artifact);
+    // everything outside it is left in place.
+    std::fs::remove_dir_all(tmp.path().join(".socket")).unwrap();
+
+    let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair"]);
+    assert_eq!(code, 0, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}");
+    assert!(tgz.is_file(), "artifact rebuilt");
+
+    // Inspect the ledger that repair wrote back.
+    let state: serde_json::Value = serde_json::from_str(
+        &std::fs::read_to_string(tmp.path().join(".socket/vendor/state.json")).unwrap(),
+    )
+    .unwrap();
+    let entry = &state["entries"][PURL];
+    assert_eq!(entry["uuid"], UUID, "state={state}");
+    assert_eq!(
+        entry["detached"], true,
+        "manifest-less reconstruction is detached: {state}"
+    );
+    assert_eq!(
+        entry["record"]["uuid"], UUID,
+        "the record is embedded for future repairs/VEX: {state}"
+    );
+}
+
+/// 9. The hardest reconstruction: no ledger, no manifest help needed beyond
+///    the record, and NO installed copy. The rewired lockfile's recorded
+///    integrity is the trust anchor: the pristine tarball is fetched
+///    unverified from the conventional registry URL and the REBUILT
+///    artifact must reproduce the wired integrity.
+#[tokio::test]
+async fn repair_reconstructs_without_installed_copy_via_wired_integrity() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    // The mock also answers the registry tarball path with the pristine bytes.
+    Mock::given(method("GET"))
+        .and(path("/left-pad/-/left-pad-1.3.0.tgz"))
+        .respond_with(ResponseTemplate::new(200).set_body_bytes(pristine_tgz()))
+        .mount(&mock)
+        .await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+
+    // Fresh-clone hole: vendor tree gone AND nothing installed.
+    std::fs::remove_dir_all(tmp.path().join(".socket/vendor")).unwrap();
+    std::fs::remove_dir_all(tmp.path().join("node_modules")).unwrap();
+
+    mount_blob(&mock).await;
+    // Built by hand rather than via `run_cli` because this run additionally
+    // sets SOCKET_NPM_REGISTRY (presumably redirecting the registry fetch to
+    // the mock — confirm against the CLI's handling of that variable).
+    let out = Command::new(binary())
+        .args([
+            "repair",
+            "--download-mode",
+            "file",
+            "--json",
+            "--api-url",
+            &mock.uri(),
+            "--api-token",
+            "fake-token",
+            "--org",
+            ORG_SLUG,
+        ])
+        .current_dir(tmp.path())
+        .env("SOCKET_TELEMETRY_DISABLED", "1")
+        .env("SOCKET_NPM_REGISTRY", mock.uri())
+        .output()
+        .expect("run");
+    let stdout = String::from_utf8_lossy(&out.stdout);
+    let stderr = String::from_utf8_lossy(&out.stderr);
+    assert_eq!(
+        out.status.code(),
+        Some(0),
+        "stdout={stdout} stderr={stderr}"
+    );
+    let v = parse_env(&stdout);
+    assert_eq!(v["summary"]["rebuilt"], 1, "envelope={v}");
+    assert!(tgz.is_file(), "artifact rebuilt from the unverified fetch");
+
+    // The rebuilt tarball's integrity is exactly what the lock records.
+    let lock = std::fs::read_to_string(tmp.path().join("package-lock.json")).unwrap();
+    let rebuilt_sri = sri_of(&std::fs::read(&tgz).unwrap());
+    assert!(
+        lock.contains(&rebuilt_sri),
+        "rebuilt sri {rebuilt_sri} must be the wired one; lock={lock}"
+    );
+}
+
+/// 10. A tampered pristine source changes the deterministic rebuild, which
+///     then fails the wired-integrity check: nothing is kept, exit 1.
+#[tokio::test]
+async fn repair_reconstruction_rejects_tampered_pristine_source() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    // The "registry" serves a tarball whose non-patched member differs.
+    let mut tampered = tar::Builder::new(flate2::write::GzEncoder::new(
+        Vec::new(),
+        flate2::Compression::default(),
+    ));
+    for (p, bytes) in [
+        (
+            "package/package.json",
+            br#"{"name":"left-pad","version":"1.3.0","scripts":{"postinstall":"evil"}}"#.as_slice(),
+        ),
+        ("package/index.js", BEFORE),
+    ] {
+        let mut header = tar::Header::new_gnu();
+        header.set_size(bytes.len() as u64);
+        header.set_mode(0o644);
+        header.set_cksum();
+        tampered.append_data(&mut header, p, bytes).unwrap();
+    }
+    // Unwrap the tar builder, then finish the gzip stream to get the bytes.
+    let tampered = tampered.into_inner().unwrap().finish().unwrap();
+    Mock::given(method("GET"))
+        .and(path("/left-pad/-/left-pad-1.3.0.tgz"))
+        .respond_with(ResponseTemplate::new(200).set_body_bytes(tampered))
+        .mount(&mock)
+        .await;
+
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+    // Same hole as scenario 9: vendor tree gone and nothing installed.
+    std::fs::remove_dir_all(tmp.path().join(".socket/vendor")).unwrap();
+    std::fs::remove_dir_all(tmp.path().join("node_modules")).unwrap();
+
+    mount_blob(&mock).await;
+    // Built by hand rather than via `run_cli` because this run additionally
+    // sets SOCKET_NPM_REGISTRY (presumably redirecting the registry fetch to
+    // the mock — confirm against the CLI's handling of that variable).
+    let out = Command::new(binary())
+        .args([
+            "repair",
+            "--download-mode",
+            "file",
+            "--json",
+            "--api-url",
+            &mock.uri(),
+            "--api-token",
+            "fake-token",
+            "--org",
+            ORG_SLUG,
+        ])
+        .current_dir(tmp.path())
+        .env("SOCKET_TELEMETRY_DISABLED", "1")
+        .env("SOCKET_NPM_REGISTRY", mock.uri())
+        .output()
+        .expect("run");
+    let stdout = String::from_utf8_lossy(&out.stdout);
+    assert_eq!(out.status.code(), Some(1), "stdout={stdout}");
+    let v = parse_env(&stdout);
+    assert!(
+        events_of(&v).iter().any(|e| e["action"] == "failed"
+            && e["errorCode"] == "vendor_artifact_rebuild_failed"
+            && e["error"]
+                .as_str()
+                .unwrap_or("")
+                .contains("integrity the lockfile records")),
+        "envelope={v}"
+    );
+    assert!(!tgz.exists(), "a tampered rebuild must not be kept");
+}
+
+/// Dry run previews the rebuild without touching disk.
+#[tokio::test]
+async fn repair_dry_run_previews_rebuild() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+    std::fs::remove_file(&tgz).unwrap();
+
+    let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair", "--dry-run"]);
+    assert_eq!(code, 0, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    // The preview is reported as a "verified" event flagged `wouldRebuild`.
+    assert!(
+        events_of(&v).iter().any(|e| e["action"] == "verified"
+            && e["details"]["wouldRebuild"] == true
+            && e["purl"] == PURL),
+        "envelope={v}"
+    );
+    assert!(!tgz.exists(), "dry run writes nothing");
+}
+
+/// Offline with a broken artifact and NO local sources: a calm, loud,
+/// per-entry failure naming the purl and the path; exit 1.
+#[tokio::test]
+async fn repair_offline_without_sources_fails_loudly() {
+    let mock = MockServer::start().await;
+    mount_patch_api(&mock).await;
+    let tmp = tempfile::tempdir().unwrap();
+    write_fixture(
+        tmp.path(),
+        "https://registry.npmjs.org/left-pad/-/left-pad-1.3.0.tgz",
+        "sha512-orig==",
+    );
+    let tgz = vendor_project(tmp.path(), &mock.uri(), &[]);
+    std::fs::remove_file(&tgz).unwrap();
+    // No installed copy either — and no local patch sources.
+    std::fs::remove_dir_all(tmp.path().join("node_modules")).unwrap();
+
+    let (code, stdout, stderr) = run_cli(tmp.path(), &mock.uri(), &["repair", "--offline"]);
+    assert_eq!(code, 1, "stdout={stdout} stderr={stderr}");
+    let v = parse_env(&stdout);
+    // Keep only the failure events, then look for the one about this purl.
+    let failed: Vec<_> = events_of(&v)
+        .into_iter()
+        .filter(|e| e["action"] == "failed")
+        .collect();
+    assert!(
+        failed
+            .iter()
+            .any(|e| e["purl"] == PURL && e["error"].as_str().unwrap_or("").contains("--offline")),
+        "the failure names the purl and the offline cause: {v}"
+    );
+    assert!(!tgz.exists());
+}
diff --git a/crates/socket-patch-core/src/patch/vendor/cargo.rs b/crates/socket-patch-core/src/patch/vendor/cargo.rs
index 614fd28..5e4eeea 100644
--- a/crates/socket-patch-core/src/patch/vendor/cargo.rs
+++ b/crates/socket-patch-core/src/patch/vendor/cargo.rs
@@ -77,14 +77,8 @@ async fn lock_entry_detached(project_root: &Path, name: &str, version: &str) ->
 /// `afterHash`, the config entry points at this copy, and the lock entry is
 /// already detached — i.e. a re-run has nothing to do. Touch nothing then, so
 /// cargo's source fingerprint and the committed bytes stay stable.
-async fn vendor_in_sync(
-    copy_dir: &Path,
-    files: &HashMap,
-    project_root: &Path,
-    name: &str,
-    version: &str,
-    copy_rel: &str,
-) -> bool {
+/// The committed copy exists and every patched file matches its afterHash.
+async fn copy_hashes_ok(copy_dir: &Path, files: &HashMap) -> bool {
     if tokio::fs::metadata(copy_dir).await.is_err() {
         return false;
     }
@@ -95,6 +89,12 @@ async fn vendor_in_sync(
             _ => return false,
         }
     }
+    true
+}
+
+/// The config `[patch]` entry points at THIS copy and the lock entry is
+/// already detached — the wiring half of the in-sync test.
+async fn wiring_in_sync(project_root: &Path, name: &str, version: &str, copy_rel: &str) -> bool { let entries = cargo_config::read_patch_entries(project_root).await; if entries.get(name).and_then(|i| i.path.as_deref()) != Some(copy_rel) { return false; @@ -294,26 +294,66 @@ pub async fn vendor_cargo_crate( // Hot path: already in sync → touch nothing (entry stays with the caller's // existing ledger record, which holds the unrecoverable lock originals). - if vendor_in_sync( - ©_dir, - &record.files, - project_root, - name, - version, - ©_rel, - ) - .await - { - let verified = record - .files - .keys() - .map(|f| already_patched_verify(f)) - .collect(); - return done( - synthesized_result(purl, ©_dir, verified, true, None), - None, - Vec::new(), - ); + if wiring_in_sync(project_root, name, version, ©_rel).await { + if copy_hashes_ok(©_dir, &record.files).await { + let verified = record + .files + .keys() + .map(|f| already_patched_verify(f)) + .collect(); + return done( + synthesized_result(purl, ©_dir, verified, true, None), + None, + Vec::new(), + ); + } + // Wired but the committed copy is missing/stale: rebuild the + // ARTIFACT only — config + lock are already correct, and the full + // path's surgery would re-record live vendored state over the + // first run's unrecoverable lock originals. 
+ if let Err(e) = fresh_copy(pristine_src, ©_dir, Some(".cargo-checksum.json")).await { + let _ = remove_tree(&uuid_dir).await; + return done( + synthesized_result( + purl, + ©_dir, + Vec::new(), + false, + Some(format!("failed to copy pristine source: {e}")), + ), + None, + Vec::new(), + ); + } + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( + purl, + ©_dir, + record, + sources, + false, + force, + name, + version, + &mut warnings, + ) + .await; + result.package_path = copy_dir.display().to_string(); + if !result.success { + let _ = remove_tree(&uuid_dir).await; + return done(result, None, warnings); + } + // Same path-dep invariant as the full path: no checksum sidecar. + let _ = tokio::fs::remove_file(copy_dir.join(".cargo-checksum.json")).await; + result.sidecar = None; + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored copy for {name}@{version} was missing or stale; \ + rebuilt at {copy_rel} (config and lock untouched)" + ), + )); + return done(result, None, warnings); } // ── materialise the patched copy ────────────────────────────────────── @@ -1042,6 +1082,57 @@ mod tests { ); } + /// Wired config+lock with a deleted committed copy: the artifact is + /// rebuilt in place, config and lock stay byte-identical, no fresh entry. 
+ #[tokio::test] + async fn test_wired_missing_copy_rebuilds_artifact_only() { + let (dir, blobs, pristine, record) = fixture().await; + let root = dir.path(); + expect_done(run_vendor(PURL, root, &blobs, &pristine, &record, false).await); + + let copy = root.join(copy_rel()).join("src/lib.rs"); + let cfg = root.join(".cargo/config.toml"); + let lock = root.join("Cargo.lock"); + let copy1 = tokio::fs::read(©).await.unwrap(); + let cfg1 = tokio::fs::read(&cfg).await.unwrap(); + let lock1 = tokio::fs::read(&lock).await.unwrap(); + + crate::patch::copy_tree::remove_tree(&root.join(copy_rel())) + .await + .unwrap(); + + let (result, entry, warnings) = + expect_done(run_vendor(PURL, root, &blobs, &pristine, &record, false).await); + assert!(result.success, "{:?}", result.error); + assert!( + entry.is_none(), + "artifact-only rebuild must not emit a fresh entry" + ); + assert!( + warnings.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {warnings:?}" + ); + assert_eq!( + tokio::fs::read(©).await.unwrap(), + copy1, + "rebuilt copy carries the patched bytes" + ); + assert!( + !root.join(copy_rel()).join(".cargo-checksum.json").exists(), + "no checksum sidecar in the rebuilt path-dep copy" + ); + assert_eq!( + tokio::fs::read(&cfg).await.unwrap(), + cfg1, + "config untouched" + ); + assert_eq!( + tokio::fs::read(&lock).await.unwrap(), + lock1, + "lock untouched" + ); + } + #[tokio::test] async fn test_dry_run_writes_nothing() { let (dir, blobs, pristine, record) = fixture().await; diff --git a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs index 67ab6f6..30a0334 100644 --- a/crates/socket-patch-core/src/patch/vendor/composer_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/composer_lock.rs @@ -176,19 +176,76 @@ pub async fn vendor_composer( // at the uuid path → touch nothing, report AlreadyPatched. 
`entry` stays // `None`: the first run's ledger entry holds the only copy of the // verbatim pre-vendor original, and re-recording here would clobber it. - if entry_is_wired(&lock[section][idx], ©_rel) - && copy_matches_after_hashes(©_dir, &record.files).await - { - let verified = record - .files - .keys() - .map(|f| already_patched_verify(f)) - .collect(); - return VendorOutcome::Done { - result: synthesized_result(purl, ©_dir, verified, true, None), - entry: None, - warnings: Vec::new(), - }; + if entry_is_wired(&lock[section][idx], ©_rel) { + if copy_matches_after_hashes(©_dir, &record.files).await { + let verified = record + .files + .keys() + .map(|f| already_patched_verify(f)) + .collect(); + return VendorOutcome::Done { + result: synthesized_result(purl, ©_dir, verified, true, None), + entry: None, + warnings: Vec::new(), + }; + } + // Wired but the committed copy is missing/stale: rebuild the + // ARTIFACT only. The lock is already correct and the first run's + // ledger entry holds the only pre-vendor original — running the + // full path here would re-record the live VENDORED fragment as + // `original`, breaking a later `--revert`. + if !dry_run { + if let Err(e) = fresh_copy(installed_dir, ©_dir, None).await { + return VendorOutcome::Done { + result: synthesized_result( + purl, + ©_dir, + Vec::new(), + false, + Some(format!("failed to copy installed package: {e}")), + ), + entry: None, + warnings: Vec::new(), + }; + } + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( + purl, + ©_dir, + record, + sources, + false, + force, + &pkg, + version, + &mut warnings, + ) + .await; + result.package_path = copy_dir.display().to_string(); + if !result.success { + // Don't leave a half-built copy; the pre-state was already + // broken, so removing restores the (missing) status quo. 
+ let _ = remove_tree(&uuid_dir).await; + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored copy for {pkg}@{version} was missing or stale; \ + rebuilt at {copy_rel} (composer.lock untouched)" + ), + )); + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + // Dry runs fall through to the verify-only preview below. } // ── dry run: verify-only against the installed dir, no writes ──────── @@ -1078,6 +1135,51 @@ mod tests { ); } + /// Wired lock + deleted/corrupt copy: the artifact is rebuilt in place, + /// the lock stays byte-identical, no ledger entry is re-recorded. + #[tokio::test] + async fn test_wired_missing_copy_rebuilds_artifact_only() { + let lock = lock_value("psr/log", "3.0.2", false); + let (dir, blobs, installed, record) = fixture(&lock).await; + let root = dir.path(); + + let (r1, e1, _) = + unwrap_done(run_vendor(root, &blobs, &installed, &record, PURL, false).await); + assert!(r1.success); + assert!(e1.is_some()); + let lock_bytes = tokio::fs::read(root.join(COMPOSER_LOCK)).await.unwrap(); + let patched = root.join(copy_rel()).join("src/LoggerInterface.php"); + let patched_bytes = tokio::fs::read(&patched).await.unwrap(); + + // Simulate the fresh-clone hole: the committed copy is gone. 
+ crate::patch::copy_tree::remove_tree(&root.join(copy_rel())) + .await + .unwrap(); + + let (r2, e2, w2) = + unwrap_done(run_vendor(root, &blobs, &installed, &record, PURL, false).await); + assert!(r2.success, "{:?}", r2.error); + assert!( + e2.is_none(), + "artifact-only rebuild must not re-record (the live vendored \ + fragment would clobber the pre-vendor original)" + ); + assert!( + w2.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {w2:?}" + ); + assert_eq!( + tokio::fs::read(&patched).await.unwrap(), + patched_bytes, + "rebuilt copy carries the patched bytes" + ); + assert_eq!( + tokio::fs::read(root.join(COMPOSER_LOCK)).await.unwrap(), + lock_bytes, + "composer.lock untouched by the rebuild" + ); + } + #[tokio::test] async fn test_dry_run_writes_nothing() { let lock = lock_value("psr/log", "3.0.2", false); diff --git a/crates/socket-patch-core/src/patch/vendor/gem.rs b/crates/socket-patch-core/src/patch/vendor/gem.rs index c125bdf..8a4ad73 100644 --- a/crates/socket-patch-core/src/patch/vendor/gem.rs +++ b/crates/socket-patch-core/src/patch/vendor/gem.rs @@ -244,40 +244,115 @@ pub async fn vendor_gem( // the first run's ledger entry holds the only copy of the pre-vendor // originals. 
let remote_line = format!(" remote: {copy_rel}"); - let wired = copy_matches_after_hashes(©_dir, &record.files).await + let lock_wired = + lock_text.split('\n').any(|l| l == remote_line) && gemfile_text.contains(©_rel); + let copy_ok = copy_matches_after_hashes(©_dir, &record.files).await && tokio::fs::metadata(copy_dir.join(format!("{name}.gemspec"))) .await - .is_ok() - && lock_text.split('\n').any(|l| l == remote_line) - && gemfile_text.contains(©_rel); - if wired { + .is_ok(); + if lock_wired { if lock_checksum_in_sync(&lock_text, name, version) { - let verified = record - .files - .keys() - .map(|f| already_patched_verify(f)) - .collect(); - return VendorOutcome::Done { - result: synthesized_result(purl, ©_dir, verified, true, None), - entry: None, - warnings: Vec::new(), - }; + if copy_ok { + let verified = record + .files + .keys() + .map(|f| already_patched_verify(f)) + .collect(); + return VendorOutcome::Done { + result: synthesized_result(purl, ©_dir, verified, true, None), + entry: None, + warnings: Vec::new(), + }; + } + // Wired (Gemfile + lock + CHECKSUMS) but the committed copy is + // missing/stale: rebuild the ARTIFACT only — the pair edit is + // already correct and the full path would re-record the live + // vendored fragments as `original`, breaking a later --revert. 
+ if !dry_run { + if let Err(e) = fresh_copy(installed_dir, ©_dir, None).await { + return VendorOutcome::Done { + result: synthesized_result( + purl, + ©_dir, + Vec::new(), + false, + Some(format!("failed to copy installed gem: {e}")), + ), + entry: None, + warnings: Vec::new(), + }; + } + if let Err(e) = + tokio::fs::write(copy_dir.join(format!("{name}.gemspec")), &spec_text).await + { + let _ = remove_tree(&uuid_dir).await; + return VendorOutcome::Done { + result: synthesized_result( + purl, + ©_dir, + Vec::new(), + false, + Some(format!( + "failed to copy the stub gemspec into the vendored dir: {e}" + )), + ), + entry: None, + warnings: Vec::new(), + }; + } + let mut warnings: Vec = Vec::new(); + let mut result = super::force_apply_staged( + purl, + ©_dir, + record, + sources, + false, + force, + name, + version, + &mut warnings, + ) + .await; + result.package_path = copy_dir.display().to_string(); + if !result.success { + let _ = remove_tree(&uuid_dir).await; + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored copy for {name}@{version} was missing or \ + stale; rebuilt at {copy_rel} (Gemfile and Gemfile.lock untouched)" + ), + )); + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + // Dry runs fall through to the verify-only preview below. + } else { + // Wired everywhere EXCEPT the lock's CHECKSUMS entry, which still + // carries the registry form — a lock wired by a pre-CHECKSUMS-aware + // socket-patch. Bundler never repairs this itself (spike G4: install, + // frozen install and `bundle lock` all silently preserve a stale + // token), and we cannot strip it here: this run records no ledger + // entry, so a revert would put back everything EXCEPT the token — + // leaving a bare CHECKSUMS entry on a registry-sourced gem, which + // hard-fails frozen installs (exit 16). 
Refuse with the repair path + // instead of the generic "already carries `path:`" Gemfile refusal. + return refused( + "vendor_stale_lock_checksum", + format!( + "Gemfile.lock already wires `{name}` to {copy_rel} but its CHECKSUMS entry is not bundler's bare path-gem form (an earlier socket-patch left the registry line in place); run `vendor --revert` for {purl} and re-vendor to repair it" + ), + ); } - // Wired everywhere EXCEPT the lock's CHECKSUMS entry, which still - // carries the registry form — a lock wired by a pre-CHECKSUMS-aware - // socket-patch. Bundler never repairs this itself (spike G4: install, - // frozen install and `bundle lock` all silently preserve a stale - // token), and we cannot strip it here: this run records no ledger - // entry, so a revert would put back everything EXCEPT the token — - // leaving a bare CHECKSUMS entry on a registry-sourced gem, which - // hard-fails frozen installs (exit 16). Refuse with the repair path - // instead of the generic "already carries `path:`" Gemfile refusal. - return refused( - "vendor_stale_lock_checksum", - format!( - "Gemfile.lock already wires `{name}` to {copy_rel} but its CHECKSUMS entry is not bundler's bare path-gem form (an earlier socket-patch left the registry line in place); run `vendor --revert` for {purl} and re-vendor to repair it" - ), - ); } // ── dry run: verify-only against the installed dir, no writes ──────── @@ -1793,6 +1868,45 @@ mod tests { ); } + /// Wired Gemfile+lock with a deleted committed copy: the artifact (and + /// its stub gemspec) is rebuilt, the pair stays byte-identical, no entry. 
+ #[tokio::test] + async fn test_wired_missing_copy_rebuilds_artifact_only() { + let (_tmp, root, installed, blobs, record) = fixture(GEMFILE_DIRECT, LOCK_DIRECT).await; + + let (r1, e1, _) = unwrap_done(run_vendor(&root, &blobs, &installed, &record, false).await); + assert!(r1.success); + assert!(e1.is_some()); + let gemfile1 = tokio::fs::read(root.join(GEMFILE)).await.unwrap(); + let lock1 = tokio::fs::read(root.join(GEMFILE_LOCK)).await.unwrap(); + let copy_root = root.join(format!(".socket/vendor/gem/{UUID}/rack-3.2.6")); + assert!(copy_root.exists()); + + crate::patch::copy_tree::remove_tree(©_root) + .await + .unwrap(); + + let (r2, e2, w2) = unwrap_done(run_vendor(&root, &blobs, &installed, &record, false).await); + assert!(r2.success, "{:?}", r2.error); + assert!( + e2.is_none(), + "artifact-only rebuild must not re-record the ledger entry" + ); + assert!( + w2.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {w2:?}" + ); + assert!( + copy_root.join("rack.gemspec").exists(), + "stub gemspec regenerated with the rebuilt copy" + ); + assert_eq!(tokio::fs::read(root.join(GEMFILE)).await.unwrap(), gemfile1); + assert_eq!( + tokio::fs::read(root.join(GEMFILE_LOCK)).await.unwrap(), + lock1 + ); + } + #[tokio::test] async fn test_dry_run_writes_nothing() { let (_tmp, root, installed, blobs, record) = fixture(GEMFILE_DIRECT, LOCK_DIRECT).await; diff --git a/crates/socket-patch-core/src/patch/vendor/golang.rs b/crates/socket-patch-core/src/patch/vendor/golang.rs index f1ad5b5..8134608 100644 --- a/crates/socket-patch-core/src/patch/vendor/golang.rs +++ b/crates/socket-patch-core/src/patch/vendor/golang.rs @@ -33,6 +33,24 @@ use super::state::{ }; use super::{RevertOutcome, VendorOutcome, VendorWarning}; +/// The committed copy exists and every patched file matches its afterHash. 
+async fn copy_hashes_ok( + copy_dir: &Path, + files: &std::collections::HashMap, +) -> bool { + if tokio::fs::metadata(copy_dir).await.is_err() { + return false; + } + for (file_name, info) in files { + let path = copy_dir.join(crate::patch::apply::normalize_file_path(file_name)); + match crate::patch::file_hash::compute_file_git_sha256(&path).await { + Ok(h) if h == info.after_hash => {} + _ => return false, + } + } + true +} + /// Vendor one Go module: patched copy in the uuid dir + a vendor-owned /// `replace` directive + marker, returning the ledger entry to persist. /// @@ -101,6 +119,18 @@ pub async fn vendor_go_module( .is_some_and(|e| e.owner == Some(ReplaceOwner::GoPatches)); let prior_path = prior.as_ref().and_then(|e| e.path.clone()); + // Re-run shape detection: the replace already points at THIS uuid's copy. + // The engine rebuilds a missing/stale copy and its replace upsert is a + // byte-stable no-op, so a wired re-run must return `entry: None` — the + // first run's ledger entry holds the only pre-vendor original, and the + // `prior_path` recorded here would be our own vendored pointer. + let wired = + prior_path.as_deref() == Some(replace_target_path(&base_rel, module, version).as_str()); + let copy_dir = project_root + .join(&base_rel) + .join(format!("{module}@{version}")); + let copy_was_ok = wired && copy_hashes_ok(&copy_dir, &record.files).await; + // Vendor auto-force policy (the engine's copy is staged from the // pristine source, never the user's tree — see `force_apply_staged`): // missing patch targets still fail closed unless the caller's own @@ -173,6 +203,45 @@ pub async fn vendor_go_module( }; } + if wired { + // Already wired to this uuid: either the engine's in-sync hot path + // (copy intact) or an artifact-only rebuild (copy was missing/stale). + // Never re-record the ledger entry.
+ if !copy_was_ok { + // A wholesale-deleted uuid dir lost the informational marker; + // restore it alongside the rebuilt copy (never a trust input — + // a failed write only warns). + let mut vulnerabilities: Vec = record.vulnerabilities.keys().cloned().collect(); + vulnerabilities.sort(); + let marker = VendorMarker { + schema_version: 1, + purl: strip_purl_qualifiers(purl).to_string(), + patch_uuid: record.uuid.clone(), + ecosystem: "golang".to_string(), + vulnerabilities, + vendored_at: vendored_at.to_string(), + }; + if let Err(e) = write_marker(&project_root.join(&base_rel), &marker).await { + warnings.push(VendorWarning::new( + "marker_write_failed", + format!("could not write the vendor marker: {e}"), + )); + } + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored copy for {module}@{version} was missing or \ + stale; rebuilt under {base_rel} (go.mod untouched)" + ), + )); + } + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + if takeover { // The `replace` line was already atomically repointed by the upsert; // the apply backend's copy is now unreachable — delete it (built from @@ -596,6 +665,47 @@ mod tests { ); } + /// Wired go.mod with a deleted committed copy: the module copy is + /// rebuilt, go.mod stays byte-identical, no fresh ledger entry. 
+ #[tokio::test] + async fn test_wired_missing_copy_rebuilds_artifact_only() { + let (dir, blobs, pristine, record) = fixture().await; + let root = dir.path(); + expect_done(run_vendor(PURL, root, &blobs, &pristine, &record, false).await); + + let copy = root.join(copy_rel()).join("bar.go"); + let gomod = root.join("go.mod"); + let copy1 = tokio::fs::read(&copy).await.unwrap(); + let mod1 = tokio::fs::read(&gomod).await.unwrap(); + + crate::patch::copy_tree::remove_tree(&root.join(copy_rel())) + .await + .unwrap(); + + let (result, entry, warnings) = + expect_done(run_vendor(PURL, root, &blobs, &pristine, &record, false).await); + assert!(result.success, "{:?}", result.error); + assert!( + entry.is_none(), + "artifact-only rebuild must not re-record (prior_path is our own \ + vendored pointer here, not a pre-vendor original)" + ); + assert!( + warnings.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {warnings:?}" + ); + assert_eq!( + tokio::fs::read(&copy).await.unwrap(), + copy1, + "rebuilt copy carries the patched bytes" + ); + assert_eq!( + tokio::fs::read(&gomod).await.unwrap(), + mod1, + "go.mod byte-stable across the rebuild" + ); + } + #[tokio::test] async fn test_idempotent_rerun_is_byte_stable() { let (dir, blobs, pristine, record) = fixture().await; @@ -614,8 +724,11 @@ mod tests { result.files_patched.is_empty(), "in-sync re-run patches nothing" ); - assert!(entry.is_some(), "re-run still reports the ledger entry"); - assert!(!entry.unwrap().took_over_go_patches); + assert!( + entry.is_none(), + "an in-sync re-run records no entry — the first run's ledger \ + entry holds the only pre-vendor original" + ); assert!(warnings.is_empty(), "{warnings:?}"); assert_eq!( tokio::fs::read(&copy).await.unwrap(), diff --git a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs index afc7a61..1fad375 100644 --- a/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs +++
b/crates/socket-patch-core/src/patch/vendor/lock_inventory.rs @@ -983,6 +983,430 @@ async fn inventory_requirements_txt(project_root: &Path) -> Option Result { + let (name, version) = parse_base_purl_coords(&entry.base_purl) + .ok_or_else(|| format!("unparseable base purl `{}`", entry.base_purl))?; + + match entry.ecosystem.as_str() { + "npm" => recover_npm_fragment(entry, &name, &version), + "cargo" => { + let checksum = entry + .lock + .as_ref() + .and_then(|l| l.checksum.clone()) + .filter(|c| is_hex_of_len(c, 64)) + .ok_or_else(|| { + "the ledger records no pre-vendor Cargo.lock checksum".to_string() + })?; + Ok(LockfileEntry { + ecosystem: "cargo", + purl: format!("pkg:cargo/{name}@{version}"), + name, + version, + resolved: None, + integrity: LockIntegrity::Sha256Hex(checksum.to_ascii_lowercase()), + }) + } + "composer" => { + let original = wiring_original(entry, &["composer_lock_package"]) + .ok_or_else(|| "no pre-vendor composer.lock fragment recorded".to_string())?; + let dist = original + .get("dist") + .ok_or_else(|| "the pre-vendor composer.lock fragment has no dist".to_string())?; + let url = dist + .get("url") + .and_then(serde_json::Value::as_str) + .and_then(http_url) + .ok_or_else(|| "the pre-vendor dist has no http(s) url".to_string())?; + let shasum = dist + .get("shasum") + .and_then(serde_json::Value::as_str) + .filter(|s| is_hex_of_len(s, 40)) + .ok_or_else(|| { + "the pre-vendor dist records no shasum; refusing an unverifiable fetch" + .to_string() + })?; + Ok(LockfileEntry { + ecosystem: "composer", + purl: format!("pkg:composer/{name}@{version}"), + name, + version, + resolved: Some(url), + integrity: LockIntegrity::Sha1Hex(shasum.to_ascii_lowercase()), + }) + } + "gem" => { + let line = wiring_original(entry, &["gemfile_lock_checksum"]) + .and_then(|v| v.as_str().map(str::to_string)) + .ok_or_else(|| "no pre-vendor Gemfile.lock checksum recorded".to_string())?; + let sha = line + .split("sha256=") + .nth(1) + .map(|rest| { + 
rest.trim_end_matches(',') + .trim() + .chars() + .take_while(|c| c.is_ascii_hexdigit()) + .collect::<String>() + }) + .filter(|s| is_hex_of_len(s, 64)) + .ok_or_else(|| { + "the pre-vendor checksum line has no sha256; refusing an unverifiable fetch" + .to_string() + })?; + let base = gem_remote_base(project_root) + .await + .unwrap_or_else(|| "https://rubygems.org".to_string()); + Ok(LockfileEntry { + ecosystem: "gem", + purl: format!("pkg:gem/{name}@{version}"), + resolved: http_url(&format!( + "{}/downloads/{name}-{version}.gem", + base.trim_end_matches('/') + )), + name, + version, + integrity: LockIntegrity::Sha256Hex(sha.to_ascii_lowercase()), + }) + } + "pypi" => { + if entry.artifact.platform_locked == Some(true) { + return Err( + "the vendored wheel is platform-locked (compiled); it cannot be rebuilt from the registry" + .to_string(), + ); + } + let unit = wiring_original(entry, &["uv_lock_package"]) + .and_then(|v| v.as_str().map(str::to_string)) + .ok_or_else(|| "no pre-vendor uv.lock fragment recorded".to_string())?; + let (url, sha) = pure_wheel_from_uv_unit(&unit).ok_or_else(|| { + "the pre-vendor uv.lock fragment lists no verifiable pure wheel".to_string() + })?; + Ok(LockfileEntry { + ecosystem: "pypi", + purl: format!("pkg:pypi/{name}@{version}"), + name, + version, + resolved: Some(url), + integrity: LockIntegrity::Sha256Hex(sha), + }) + } + other => Err(format!( + "no ledger-based registry recovery for ecosystem `{other}`" + )), + } +} + +/// The integrity the REWIRED npm-family lockfile records for a vendored +/// artifact at `artifact_rel` (forward-slashed, no `./` prefix). This is +/// the integrity of OUR deterministically packed tarball — the trust +/// anchor for repair's no-ledger reconstruction: a rebuilt tarball that +/// matches it is exactly what the package manager would have installed.
+/// +/// package-lock/shrinkwrap are parsed as JSON; the text formats (pnpm, +/// yarn classic/berry, bun) are scanned with a bounded forward window from +/// each reference line. +pub async fn wired_vendor_integrity( + project_root: &Path, + artifact_rel: &str, +) -> Option<LockIntegrity> { + let rel = artifact_rel.trim_start_matches("./"); + + // JSON locks: resolved == "file:<rel>" (npm writes exactly this form). + for lock in ["npm-shrinkwrap.json", "package-lock.json"] { + let Ok(bytes) = tokio::fs::read(project_root.join(lock)).await else { + continue; + }; + let Ok(v) = serde_json::from_slice::<serde_json::Value>(&bytes) else { + continue; + }; + if let Some(pkgs) = v.get("packages").and_then(serde_json::Value::as_object) { + for entry in pkgs.values() { + let resolved = entry.get("resolved").and_then(serde_json::Value::as_str); + if resolved.is_some_and(|r| r.trim_start_matches("file:") == rel) { + if let Some(sri) = entry + .get("integrity") + .and_then(serde_json::Value::as_str) + .filter(|s| looks_like_sri(s)) + { + return Some(LockIntegrity::Sri(sri.to_string())); + } + } + } + } + } + + // Text locks: any line referencing the artifact path, integrity within + // a short forward window (the same block). + for lock in ["pnpm-lock.yaml", "yarn.lock", "bun.lock"] { + let Ok(text) = tokio::fs::read_to_string(project_root.join(lock)).await else { + continue; + }; + let lines: Vec<&str> = text.lines().collect(); + for (i, line) in lines.iter().enumerate() { + if !line.contains(rel) { + continue; + } + for probe in lines.iter().take((i + 6).min(lines.len())).skip(i) { + // pnpm `resolution: {integrity: …}` / classic `integrity …` + // / bun tuple `"sha512-…"`.
+ if let Some(v) = inline_yaml_field(probe, "integrity:") { + if looks_like_sri(&v) { + return Some(LockIntegrity::Sri(v)); + } + } + if let Some(rest) = probe.trim().strip_prefix("integrity ") { + let v = rest.trim().trim_matches('"'); + if looks_like_sri(v) { + return Some(LockIntegrity::Sri(v.to_string())); + } + } + if let Some(sri) = probe.split('"').rev().find(|tok| looks_like_sri(tok)) { + return Some(LockIntegrity::Sri(sri.to_string())); + } + // yarn berry: `checksum: 10c0/…`. + if let Some(v) = inline_yaml_field(probe, "checksum:") { + if v.split_once('/') + .is_some_and(|(k, b)| !k.is_empty() && !b.is_empty()) + { + return Some(LockIntegrity::BerryChecksum(v)); + } + } + } + } + } + None +} + +/// `pkg:/@` → (name, version). The name may itself +/// contain `/` (npm scopes, go modules); the version is after the LAST `@`. +fn parse_base_purl_coords(base_purl: &str) -> Option<(String, String)> { + let rest = base_purl.strip_prefix("pkg:")?; + let (_, name_ver) = rest.split_once('/')?; + let (name, version) = name_ver.rsplit_once('@')?; + if name.is_empty() || version.is_empty() { + return None; + } + Some((name.to_string(), version.to_string())) +} + +/// First wiring record of one of `kinds` carrying an `original` payload. +fn wiring_original<'a>( + entry: &'a super::state::VendorEntry, + kinds: &[&str], +) -> Option<&'a serde_json::Value> { + entry + .wiring + .iter() + .find(|r| kinds.contains(&r.kind.as_str()) && r.original.is_some()) + .and_then(|r| r.original.as_ref()) +} + +fn is_hex_of_len(s: &str, len: usize) -> bool { + s.len() == len && s.bytes().all(|b| b.is_ascii_hexdigit()) +} + +/// Per-flavor npm recovery: the wiring kinds disambiguate the lock flavor, +/// each fragment yields (resolved?, integrity). 
+fn recover_npm_fragment( + entry: &super::state::VendorEntry, + name: &str, + version: &str, +) -> Result<LockfileEntry, String> { + let mk = |resolved: Option<String>, integrity: LockIntegrity| LockfileEntry { + ecosystem: "npm", + purl: format!("pkg:npm/{name}@{version}"), + name: name.to_string(), + version: version.to_string(), + resolved, + integrity, + }; + + // package-lock / shrinkwrap: the original is the full lock entry object. + if let Some(obj) = wiring_original(entry, &["npm_lock_entry", "npm_lock_legacy_entry"]) { + let resolved = obj + .get("resolved") + .and_then(serde_json::Value::as_str) + .and_then(http_url); + if let Some(sri) = obj + .get("integrity") + .and_then(serde_json::Value::as_str) + .filter(|s| looks_like_sri(s)) + { + return Ok(mk(resolved, LockIntegrity::Sri(sri.to_string()))); + } + } + // pnpm: the original is the packages block's lines; pull + // `resolution: {integrity: …, tarball: …}`. + if let Some(lines) = wiring_original(entry, &["pnpm_lock_package"]).and_then(lines_of) { + let mut sri = None; + let mut tarball = None; + for line in &lines { + if let Some(v) = inline_yaml_field(line, "integrity:") { + sri = sri.or(Some(v)); + } + if let Some(v) = inline_yaml_field(line, "tarball:") { + tarball = tarball.or(http_url(&v)); + } + } + if let Some(sri) = sri.filter(|s| looks_like_sri(s)) { + return Ok(mk(tarball, LockIntegrity::Sri(sri))); + } + } + // yarn classic: block lines carry `integrity <sri>` (preferred) and/or + // `resolved "<url>#<sha1>"`.
+ if let Some(lines) = wiring_original(entry, &["yarn_lock_block"]).and_then(lines_of) { + let mut url = None; + let mut sha1 = None; + let mut sri = None; + for line in &lines { + let t = line.trim(); + if let Some(rest) = t.strip_prefix("integrity ") { + let v = rest.trim().trim_matches('"'); + if looks_like_sri(v) { + sri = Some(v.to_string()); + } + } + if let Some(rest) = t.strip_prefix("resolved ") { + let v = rest.trim().trim_matches('"'); + let (u, frag) = v.split_once('#').unwrap_or((v, "")); + url = http_url(u); + if is_hex_of_len(frag, 40) { + sha1 = Some(frag.to_ascii_lowercase()); + } + } + } + if let Some(sri) = sri { + return Ok(mk(url, LockIntegrity::Sri(sri))); + } + if let Some(sha1) = sha1 { + return Ok(mk(url, LockIntegrity::Sha1Hex(sha1))); + } + } + // yarn berry: block lines carry `checksum: /`. + if let Some(lines) = wiring_original(entry, &["yarn_berry_lock_entry"]).and_then(lines_of) { + for line in &lines { + if let Some(v) = inline_yaml_field(line, "checksum:") { + if v.split_once('/') + .is_some_and(|(k, b)| !k.is_empty() && !b.is_empty()) + { + return Ok(mk(None, LockIntegrity::BerryChecksum(v))); + } + } + } + } + // bun: the original is the raw tuple line; the integrity is its last + // quoted SRI string. + if let Some(line) = + wiring_original(entry, &["bun_lock_package"]).and_then(|v| v.as_str().map(str::to_string)) + { + if let Some(sri) = line + .split('"') + .rev() + .find(|tok| looks_like_sri(tok)) + .map(str::to_string) + { + return Ok(mk(None, LockIntegrity::Sri(sri))); + } + } + Err("no pre-vendor npm registry fragment with a verifiable integrity recorded".to_string()) +} + +fn looks_like_sri(s: &str) -> bool { + ["sha512-", "sha384-", "sha256-", "sha1-"] + .iter() + .any(|p| s.starts_with(p) && s.len() > p.len()) +} + +/// A wiring `original` recorded as an array of text lines. 
+fn lines_of(v: &serde_json::Value) -> Option<Vec<String>> { + v.as_array().map(|arr| { + arr.iter() + .filter_map(|l| l.as_str().map(str::to_string)) + .collect() + }) +} + +/// `… field: value` (optionally inside an inline `{…}` map) → value, with +/// trailing `,`/`}` and quotes stripped. +fn inline_yaml_field(line: &str, field: &str) -> Option<String> { + let idx = line.find(field)?; + let rest = &line[idx + field.len()..]; + let end = rest.find([',', '}']).unwrap_or(rest.len()); + let v = rest[..end].trim().trim_matches(['\'', '"']).to_string(); + (!v.is_empty()).then_some(v) +} + +/// The `GEM remote:` base of the (unrewired) Gemfile.lock. +async fn gem_remote_base(project_root: &Path) -> Option<String> { + let text = tokio::fs::read_to_string(project_root.join("Gemfile.lock")) + .await + .ok()?; + let mut in_gem = false; + for line in text.lines() { + if line.trim_end() == "GEM" { + in_gem = true; + continue; + } + if in_gem { + if let Some(rest) = line.trim().strip_prefix("remote:") { + return http_url(rest.trim()); + } + if !line.starts_with(' ') && !line.trim().is_empty() { + in_gem = false; + } + } + } + None +} + +/// First `{ url = "…", hash = "sha256:…" }` wheel in a uv.lock `[[package]]` +/// unit whose filename is a PURE wheel (`-none-any.whl`).
+fn pure_wheel_from_uv_unit(unit: &str) -> Option<(String, String)> { + let mut search = unit; + while let Some(uidx) = search.find("url = \"") { + let after = &search[uidx + 7..]; + let uend = after.find('"')?; + let url = &after[..uend]; + let rest = &after[uend..]; + let advance = uidx + 7 + uend; + if url.ends_with("-none-any.whl") { + if let Some(hidx) = rest.find("hash = \"sha256:") { + let hafter = &rest[hidx + 15..]; + let hend = hafter.find('"')?; + let sha = &hafter[..hend]; + if is_hex_of_len(sha, 64) { + if let Some(url) = http_url(url) { + return Some((url, sha.to_ascii_lowercase())); + } + } + } + } + search = &search[advance..]; + } + None +} + #[cfg(test)] mod tests { use super::*; @@ -1613,3 +2037,272 @@ source = { editable = "." } assert!(inventory_npm_lock(tmp.path()).await.is_none()); } } + +#[cfg(test)] +mod recover_tests { + use super::super::state::WiringAction; + use super::super::state::{CargoLockOriginal, VendorArtifact, VendorEntry, WiringRecord}; + use super::*; + + const UUID: &str = "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"; + + fn entry(eco: &str, base_purl: &str, wiring: Vec<WiringRecord>) -> VendorEntry { + VendorEntry { + ecosystem: eco.into(), + base_purl: base_purl.into(), + uuid: UUID.into(), + artifact: VendorArtifact { + path: format!(".socket/vendor/{eco}/{UUID}/x"), + sha256: String::new(), + size: None, + platform_locked: None, + }, + wiring, + lock: None, + took_over_go_patches: false, + detached: false, + record: None, + flavor: None, + uv: None, + pnpm: None, + poetry: None, + pdm: None, + pipenv: None, + } + } + + fn rec(kind: &str, original: serde_json::Value) -> WiringRecord { + WiringRecord { + file: "lock".into(), + kind: kind.into(), + action: WiringAction::Rewritten, + key: Some("k".into()), + original: Some(original), + new: None, + } + } + + #[tokio::test] + async fn npm_lock_entry_fragment_recovers_sri_and_url() { + let tmp = tempfile::tempdir().unwrap(); + let e = entry( + "npm", + "pkg:npm/@scope/x@1.2.3", + vec![rec( +
"npm_lock_entry", + serde_json::json!({ + "resolved": "https://registry.npmjs.org/@scope/x/-/x-1.2.3.tgz", + "integrity": "sha512-AAAA", + }), + )], + ); + let got = recover_lock_entry(tmp.path(), &e).await.unwrap(); + assert_eq!(got.ecosystem, "npm"); + assert_eq!(got.name, "@scope/x"); + assert_eq!(got.version, "1.2.3"); + assert_eq!( + got.resolved.as_deref(), + Some("https://registry.npmjs.org/@scope/x/-/x-1.2.3.tgz") + ); + assert_eq!(got.integrity, LockIntegrity::Sri("sha512-AAAA".into())); + } + + #[tokio::test] + async fn pnpm_package_lines_recover_integrity_and_tarball() { + let tmp = tempfile::tempdir().unwrap(); + let e = entry( + "npm", + "pkg:npm/left-pad@1.3.0", + vec![rec( + "pnpm_lock_package", + serde_json::json!([ + " left-pad@1.3.0:", + " resolution: {integrity: sha512-BBBB, tarball: https://npm.corp/left-pad-1.3.0.tgz}", + ]), + )], + ); + let got = recover_lock_entry(tmp.path(), &e).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sri("sha512-BBBB".into())); + assert_eq!( + got.resolved.as_deref(), + Some("https://npm.corp/left-pad-1.3.0.tgz") + ); + } + + #[tokio::test] + async fn yarn_classic_block_prefers_sri_else_sha1() { + let tmp = tempfile::tempdir().unwrap(); + let sha1 = "a".repeat(40); + let with_both = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "yarn_lock_block", + serde_json::json!([ + "x@^1.0.0:", + " version \"1.0.0\"", + format!(" resolved \"https://registry.yarnpkg.com/x/-/x-1.0.0.tgz#{sha1}\""), + " integrity sha512-CCCC", + ]), + )], + ); + let got = recover_lock_entry(tmp.path(), &with_both).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sri("sha512-CCCC".into())); + assert_eq!( + got.resolved.as_deref(), + Some("https://registry.yarnpkg.com/x/-/x-1.0.0.tgz") + ); + + let sha1_only = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "yarn_lock_block", + serde_json::json!([format!( + " resolved \"https://registry.yarnpkg.com/x/-/x-1.0.0.tgz#{sha1}\"" + )]), + )], + ); + let got = 
recover_lock_entry(tmp.path(), &sha1_only).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sha1Hex(sha1)); + } + + #[tokio::test] + async fn berry_checksum_and_bun_tuple_recover() { + let tmp = tempfile::tempdir().unwrap(); + let berry = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "yarn_berry_lock_entry", + serde_json::json!(["x@npm:1.0.0:", " checksum: 10c0/abcdef"]), + )], + ); + let got = recover_lock_entry(tmp.path(), &berry).await.unwrap(); + assert_eq!( + got.integrity, + LockIntegrity::BerryChecksum("10c0/abcdef".into()) + ); + assert_eq!(got.resolved, None); + + let bun = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "bun_lock_package", + serde_json::json!(" \"x\": [\"x@1.0.0\", \"\", {}, \"sha512-DDDD\"],"), + )], + ); + let got = recover_lock_entry(tmp.path(), &bun).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sri("sha512-DDDD".into())); + } + + #[tokio::test] + async fn cargo_recovers_from_entry_lock_checksum() { + let tmp = tempfile::tempdir().unwrap(); + let sha = "b".repeat(64); + let mut e = entry("cargo", "pkg:cargo/serde@1.0.0", vec![]); + e.lock = Some(CargoLockOriginal { + source: "registry+https://github.com/rust-lang/crates.io-index".into(), + checksum: Some(sha.clone()), + }); + let got = recover_lock_entry(tmp.path(), &e).await.unwrap(); + assert_eq!(got.ecosystem, "cargo"); + assert_eq!(got.integrity, LockIntegrity::Sha256Hex(sha)); + assert_eq!(got.resolved, None); + + // No checksum recorded → unrecoverable, never an unverified fetch. 
+ let mut bare = entry("cargo", "pkg:cargo/serde@1.0.0", vec![]); + bare.lock = None; + assert!(recover_lock_entry(tmp.path(), &bare).await.is_err()); + } + + #[tokio::test] + async fn composer_gem_uv_fragments_recover() { + let tmp = tempfile::tempdir().unwrap(); + let sha1 = "c".repeat(40); + let composer = entry( + "composer", + "pkg:composer/monolog/monolog@2.9.1", + vec![rec( + "composer_lock_package", + serde_json::json!({ + "name": "monolog/monolog", + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Seldaek/monolog/zipball/abc", + "shasum": sha1, + }, + }), + )], + ); + let got = recover_lock_entry(tmp.path(), &composer).await.unwrap(); + assert_eq!(got.name, "monolog/monolog"); + assert_eq!(got.integrity, LockIntegrity::Sha1Hex(sha1)); + + // gem: checksum line + remote read from the unrewired Gemfile.lock. + let sha256 = "d".repeat(64); + tokio::fs::write( + tmp.path().join("Gemfile.lock"), + "GEM\n remote: https://rubygems.org/\n specs:\n rack (3.0.0)\n", + ) + .await + .unwrap(); + let gem = entry( + "gem", + "pkg:gem/rack@3.0.0", + vec![rec( + "gemfile_lock_checksum", + serde_json::json!(format!(" rack (3.0.0) sha256={sha256}")), + )], + ); + let got = recover_lock_entry(tmp.path(), &gem).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sha256Hex(sha256.clone())); + assert_eq!( + got.resolved.as_deref(), + Some("https://rubygems.org/downloads/rack-3.0.0.gem") + ); + + // uv: the original [[package]] unit lists wheels; only the PURE one + // is recoverable. 
+ let wheel_sha = "e".repeat(64); + let unit = format!( + "[[package]]\nname = \"six\"\nversion = \"1.16.0\"\nwheels = [\n {{ url = \"https://files.pythonhosted.org/packages/six-1.16.0-cp39-cp39-linux_x86_64.whl\", hash = \"sha256:{}\" }},\n {{ url = \"https://files.pythonhosted.org/packages/six-1.16.0-py2.py3-none-any.whl\", hash = \"sha256:{wheel_sha}\" }},\n]\n", + "f".repeat(64) + ); + let uv = entry( + "pypi", + "pkg:pypi/six@1.16.0", + vec![rec("uv_lock_package", serde_json::json!(unit))], + ); + let got = recover_lock_entry(tmp.path(), &uv).await.unwrap(); + assert_eq!(got.integrity, LockIntegrity::Sha256Hex(wheel_sha)); + assert!(got.resolved.unwrap().ends_with("py2.py3-none-any.whl")); + + // platform-locked wheels are explicitly unrepairable from the registry. + let mut locked = entry("pypi", "pkg:pypi/six@1.16.0", vec![]); + locked.artifact.platform_locked = Some(true); + assert!(recover_lock_entry(tmp.path(), &locked).await.is_err()); + } + + #[tokio::test] + async fn unrecoverable_fragments_fail_closed() { + let tmp = tempfile::tempdir().unwrap(); + // No wiring at all. + let bare = entry("npm", "pkg:npm/x@1.0.0", vec![]); + assert!(recover_lock_entry(tmp.path(), &bare).await.is_err()); + // golang routes through go.sum, never the ledger. + let go = entry("golang", "pkg:golang/golang.org/x/text@v0.14.0", vec![]); + assert!(recover_lock_entry(tmp.path(), &go).await.is_err()); + // Poisoned integrity shapes are rejected. 
+ let bad = entry( + "npm", + "pkg:npm/x@1.0.0", + vec![rec( + "npm_lock_entry", + serde_json::json!({"resolved": "https://x/", "integrity": "lol"}), + )], + ); + assert!(recover_lock_entry(tmp.path(), &bad).await.is_err()); + } +} diff --git a/crates/socket-patch-core/src/patch/vendor/mod.rs b/crates/socket-patch-core/src/patch/vendor/mod.rs index 4551991..1906cfc 100644 --- a/crates/socket-patch-core/src/patch/vendor/mod.rs +++ b/crates/socket-patch-core/src/patch/vendor/mod.rs @@ -76,6 +76,7 @@ pub mod yarn_classic_lock; pub use path::{ecosystem_dir_for_purl, parse_vendor_path, VendorPathParts, VENDOR_DIR}; pub use state::{load_state, save_state, VendorEntry, VendorState, VENDOR_STATE_REL}; +pub use verify::{check_vendored_artifact, file_sha256_hex, ArtifactHealth}; use std::collections::HashMap; use std::path::Path; diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index 2f74548..179d59d 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -1004,6 +1004,34 @@ fn edit_packages( let new_key = ctx.new_key(); let ours_prefix = format!("{}@file:", ctx.name); + // Fail closed on a half-drifted lock: when BOTH the registry-keyed + // entry and a socket file:-keyed entry for this package exist, a rekey + // would splice a DUPLICATE mapping key (pnpm refuses to parse those) + // and surgery cannot decide which block carries the truth. 
+ { + let mut has_registry = false; + let mut has_ours = false; + let mut j = start + 1; + while let Some(block) = next_block(lines, j, end) { + if block.key == reg_key { + has_registry = true; + } else if block + .key + .strip_prefix(&ours_prefix) + .is_some_and(|rest| parse_vendor_path(rest).is_some_and(|p| p.eco == "npm")) + { + has_ours = true; + } + j = block.end; + } + if has_registry && has_ours { + return Err(format!( + "packages section carries BOTH `{reg_key}` and a `{ours_prefix}…` entry (a \ + half-edited lock); run `pnpm install` to re-resolve it, then re-vendor" + )); + } + } + let mut i = start + 1; while let Some(block) = next_block(lines, i, end) { let is_registry = block.key == reg_key; @@ -1086,6 +1114,30 @@ fn edit_snapshot_rekey( let reg_key = ctx.reg_key(); let new_key = ctx.new_key(); let ours_prefix = format!("{}@file:", ctx.name); + // Same duplicate-key fail-closed guard as edit_packages. + { + let mut has_registry = false; + let mut has_ours = false; + let mut j = start + 1; + while let Some(block) = next_block(lines, j, end) { + if block.key == reg_key { + has_registry = true; + } else if block + .key + .strip_prefix(&ours_prefix) + .is_some_and(|rest| parse_vendor_path(rest).is_some_and(|p| p.eco == "npm")) + { + has_ours = true; + } + j = block.end; + } + if has_registry && has_ours { + return Err(format!( + "snapshots section carries BOTH `{reg_key}` and a `{ours_prefix}…` entry (a \ + half-edited lock); run `pnpm install` to re-resolve it, then re-vendor" + )); + } + } let mut i = start + 1; while let Some(block) = next_block(lines, i, end) { let is_registry = block.key == reg_key; @@ -2613,6 +2665,38 @@ snapshots: ); } + /// A half-edited lock carrying BOTH the registry-keyed packages entry + /// AND a socket file:-keyed one: a rekey would splice a DUPLICATE + /// mapping key (pnpm refuses to parse those) — fail closed, nothing + /// written. 
+ #[tokio::test] + async fn half_drifted_duplicate_keys_fail_closed() { + let dup_lock = P1_BEFORE_LOCK.replace( + " left-pad@1.3.0:\n resolution: {integrity: sha512-XI5MPzVNApjAyhQzphX8BkmKsKUxD4LdyK24iZeQGinBN9yTQT3bFlCBy/aVx2HrNcqQGsdot8ghrjyrvMCoEA==}\n deprecated: use String.prototype.padStart()", + &format!( + " left-pad@1.3.0:\n resolution: {{integrity: sha512-XI5MPzVNApjAyhQzphX8BkmKsKUxD4LdyK24iZeQGinBN9yTQT3bFlCBy/aVx2HrNcqQGsdot8ghrjyrvMCoEA==}}\n deprecated: use String.prototype.padStart()\n\n left-pad@file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz:\n resolution: {{integrity: sha512-stale==, tarball: file:.socket/vendor/npm/{UUID}/left-pad-1.3.0.tgz}}\n version: 1.3.0" + ), + ); + assert_ne!(dup_lock, P1_BEFORE_LOCK, "fixture edit must apply"); + let fx = fixture_with(P1_BEFORE_PKG, &dup_lock).await; + let lock_before = fx.read(PNPM_LOCK).await; + let pkg_before = fx.read(PACKAGE_JSON).await; + + let (result, entry, _) = expect_done(fx.vendor(false).await); + assert!(!result.success, "half-drifted lock must fail closed"); + assert!( + result + .error + .as_deref() + .is_some_and(|e| e.contains("half-edited lock")), + "{:?}", + result.error + ); + assert!(entry.is_none()); + assert_eq!(fx.read(PNPM_LOCK).await, lock_before, "lock untouched"); + assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before, "pkg untouched"); + } + #[tokio::test] async fn dry_run_writes_nothing() { let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; diff --git a/crates/socket-patch-core/src/patch/vendor/pypi.rs b/crates/socket-patch-core/src/patch/vendor/pypi.rs index 8b317f3..b9b31f3 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi.rs @@ -20,7 +20,7 @@ use super::pypi_poetry::{PoetryProject, PoetryTarget}; use super::pypi_requirements::{preflight_requirements, revert_requirements, wire_requirements}; use super::pypi_uv::{ check_target_guards, classify_dependency, load_uv_project, revert_uv, wire_uv, 
UvDepClass, - UvProject, + UvProject, UvTarget, }; use super::pypi_wheel::{build_patched_wheel, locate_installed_dist, wheel_file_name}; use super::state::{ @@ -221,6 +221,9 @@ enum WiringPlan { Poetry(Box), Pdm(Box), Pipenv(Box), + /// The lock already routes this package through THIS patch uuid's + /// vendored wheel: no wiring — verify (or rebuild) the artifact only. + InSync, } /// Which `VendorEntry` meta slot a flavor's wiring produced. @@ -232,6 +235,20 @@ enum MetaSlot { None, } +/// The uuid dir holds a wheel artifact — the cheap, flavor-agnostic +/// presence probe for the in-sync hot path (one uuid owns one wheel). +async fn uuid_dir_has_wheel(uuid_dir: &Path) -> bool { + let Ok(mut rd) = tokio::fs::read_dir(uuid_dir).await else { + return false; + }; + while let Ok(Some(e)) = rd.next_entry().await { + if e.file_name().to_string_lossy().ends_with(".whl") { + return true; + } + } + false +} + /// Build the synthesized AlreadyPatched outcome for an in-sync re-run: the /// artifact + lockfile already point at THIS patch uuid, so nothing is built /// or recorded (the first run's ledger entry holds the only copy of the @@ -323,12 +340,15 @@ pub async fn vendor_pypi( Ok(p) => p, Err((code, detail)) => return VendorOutcome::Refused { code, detail }, }; - if let Err((code, detail)) = check_target_guards(&project, &canon_name) { - return VendorOutcome::Refused { code, detail }; + match check_target_guards(&project, &canon_name, &record.uuid) { + Ok(UvTarget::InSync) => WiringPlan::InSync, + Ok(UvTarget::Fresh) => { + warnings.extend(project.warnings.iter().cloned()); + let class = classify_dependency(&project, &canon_name); + WiringPlan::Uv(Box::new(project), class) + } + Err((code, detail)) => return VendorOutcome::Refused { code, detail }, } - warnings.extend(project.warnings.iter().cloned()); - let class = classify_dependency(&project, &canon_name); - WiringPlan::Uv(Box::new(project), class) } PypiFlavor::Requirements => { if let Err((code, detail)) = @@ 
-349,12 +369,13 @@ pub async fn vendor_pypi( version, &record.uuid, ) { - Ok(PoetryTarget::Fresh) => {} - Ok(PoetryTarget::InSync) => return in_sync_outcome(base, record, warnings), + Ok(PoetryTarget::InSync) => WiringPlan::InSync, + Ok(PoetryTarget::Fresh) => { + warnings.extend(project.warnings.iter().cloned()); + WiringPlan::Poetry(Box::new(project)) + } Err((code, detail)) => return VendorOutcome::Refused { code, detail }, } - warnings.extend(project.warnings.iter().cloned()); - WiringPlan::Poetry(Box::new(project)) } PypiFlavor::Pdm => { let project = match super::pypi_pdm::load_pdm_project(project_root).await { @@ -363,12 +384,13 @@ pub async fn vendor_pypi( }; match super::pypi_pdm::check_target_guards(&project, &canon_name, version, &record.uuid) { - Ok(PdmTarget::Fresh) => {} - Ok(PdmTarget::InSync) => return in_sync_outcome(base, record, warnings), + Ok(PdmTarget::InSync) => WiringPlan::InSync, + Ok(PdmTarget::Fresh) => { + warnings.extend(project.warnings.iter().cloned()); + WiringPlan::Pdm(Box::new(project)) + } Err((code, detail)) => return VendorOutcome::Refused { code, detail }, } - warnings.extend(project.warnings.iter().cloned()); - WiringPlan::Pdm(Box::new(project)) } PypiFlavor::Pipenv => { let project = match super::pypi_pipenv::load_pipenv_project(project_root).await { @@ -376,15 +398,28 @@ pub async fn vendor_pypi( Err((code, detail)) => return VendorOutcome::Refused { code, detail }, }; match super::pypi_pipenv::check_target_guards(&project, &canon_name, &record.uuid) { - Ok(PipenvTarget::Fresh) => {} - Ok(PipenvTarget::InSync) => return in_sync_outcome(base, record, warnings), + Ok(PipenvTarget::InSync) => WiringPlan::InSync, + Ok(PipenvTarget::Fresh) => { + warnings.extend(project.warnings.iter().cloned()); + WiringPlan::Pipenv(Box::new(project)) + } Err((code, detail)) => return VendorOutcome::Refused { code, detail }, } - warnings.extend(project.warnings.iter().cloned()); - WiringPlan::Pipenv(Box::new(project)) } }; + let in_sync = 
matches!(plan, WiringPlan::InSync); + if in_sync { + // Wired to this uuid already. Intact artifact → the classic in-sync + // skip (no dist lookup — a not-installed re-run must stay green). + // Missing artifact → rebuild the wheel only; the wiring is correct + // and re-running it would re-record live vendored fragments as + // pre-vendor originals. + if uuid_dir_has_wheel(&project_root.join(&uuid_dir_rel)).await || dry_run { + return in_sync_outcome(base, record, warnings); + } + } + let dist = match locate_installed_dist(site_packages, raw_name, version).await { Ok(d) => d, Err((code, detail)) => return VendorOutcome::Refused { code, detail }, @@ -458,6 +493,40 @@ pub async fn vendor_pypi( )); } + if in_sync { + // Artifact rebuilt; wiring untouched, ledger entry stays with the + // first run (the only copy of the pre-vendor originals). + warnings.push(VendorWarning::new( + "vendor_artifact_rebuilt", + format!( + "the committed vendored wheel for {canon_name}=={version} was missing; \ + rebuilt at {rel_wheel} (lockfile untouched)" + ), + )); + // Restore the informational marker the deleted uuid dir lost. + let mut vulns: Vec = record.vulnerabilities.keys().cloned().collect(); + vulns.sort(); + let marker = VendorMarker { + schema_version: 1, + purl: base.to_string(), + patch_uuid: record.uuid.clone(), + ecosystem: "pypi".to_string(), + vulnerabilities: vulns, + vendored_at: vendored_at.to_string(), + }; + if let Err(e) = write_marker(&project_root.join(&uuid_dir_rel), &marker).await { + warnings.push(VendorWarning::new( + "marker_write_failed", + format!("could not write the vendor marker: {e}"), + )); + } + return VendorOutcome::Done { + result, + entry: None, + warnings, + }; + } + // Marker: artifact-side breadcrumb in the uuid dir (informational only — // sweep/verify key off state.json + the path uuid). Written before the // wiring so lockfile edits stay the last mutation. 
@@ -495,6 +564,7 @@ pub async fn vendor_pypi( &wheel_name, &artifact.sha256_hex, class, + &record.uuid, ) .await .map(|(wiring, meta)| (wiring, MetaSlot::Uv(Some(meta)))), @@ -541,6 +611,8 @@ pub async fn vendor_pypi( ) .await .map(|(wiring, meta)| (wiring, MetaSlot::Pipenv(meta))), + // Returned right after the wheel build above. + WiringPlan::InSync => unreachable!("in-sync rebuilds never reach wiring"), }; let (wiring, meta) = match wired { Ok(pair) => pair, @@ -954,6 +1026,130 @@ mod tests { assert!(!fx.root.join(format!(".socket/vendor/pypi/{UUID}")).exists()); } + /// uv flavor, wired pair with a deleted committed wheel: the wheel is + /// rebuilt at the recorded path, pyproject + lock stay byte-identical, + /// no fresh ledger entry. An INTACT wheel stays the classic in-sync skip. + #[tokio::test] + async fn uv_wired_missing_wheel_rebuilds_artifact_only() { + let fx = e2e_fixture().await; + // Swap the requirements flavor for a uv project. + tokio::fs::remove_file(fx.root.join("requirements.txt")) + .await + .unwrap(); + touch( + &fx.root, + "pyproject.toml", + r#"[project] +name = "proj" +version = "0.1.0" +requires-python = ">=3.10" +dependencies = ["six==1.16.0"] +"#, + ) + .await; + touch( + &fx.root, + "uv.lock", + r#"version = 1 +revision = 3 +requires-python = ">=3.10" + +[[package]] +name = "proj" +version = "0.1.0" +source = { virtual = "." 
} +dependencies = [ + { name = "six" }, +] + +[package.metadata] +requires-dist = [{ name = "six", specifier = "==1.16.0" }] + +[[package]] +name = "six" +version = "1.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/71/39/171f1c67cd00715f190ba0b100d606d440a28c93c7714febeca8b79af85e/six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", size = 34041, upload-time = "2021-05-05T14:18:18.379Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254", size = 11053, upload-time = "2021-05-05T14:18:17.237Z" }, +] +"#, + ) + .await; + let sources = PatchSources::blobs_only(&fx.blobs); + let vendor_one = |dry_run: bool| { + vendor_pypi( + "pkg:pypi/six@1.16.0", + &fx.site_packages, + &fx.root, + &fx.record, + &sources, + "2026-06-09T00:00:00Z", + dry_run, + false, + ) + }; + + let VendorOutcome::Done { result, entry, .. } = vendor_one(false).await else { + panic!("first vendor must be Done"); + }; + assert!(result.success, "{:?}", result.error); + assert!(entry.is_some()); + let pyproject1 = tokio::fs::read(fx.root.join("pyproject.toml")) + .await + .unwrap(); + let lock1 = tokio::fs::read(fx.root.join("uv.lock")).await.unwrap(); + let uuid_dir = fx.root.join(format!(".socket/vendor/pypi/{UUID}")); + let wheel = uuid_dir.join("six-1.16.0-py2.py3-none-any.whl"); + assert!(wheel.is_file()); + + // Intact wheel: in-sync skip (no rebuild, no entry). 
+ let VendorOutcome::Done { + result: r2, + entry: e2, + warnings: w2, + } = vendor_one(false).await + else { + panic!("re-run must be Done"); + }; + assert!(r2.success); + assert!(e2.is_none(), "in-sync re-run records nothing"); + assert!( + !w2.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "intact wheel must not claim a rebuild: {w2:?}" + ); + + // Deleted wheel: artifact-only rebuild. + tokio::fs::remove_dir_all(&uuid_dir).await.unwrap(); + let VendorOutcome::Done { + result: r3, + entry: e3, + warnings: w3, + } = vendor_one(false).await + else { + panic!("rebuild run must be Done"); + }; + assert!(r3.success, "{:?}", r3.error); + assert!(e3.is_none(), "artifact-only rebuild records no entry"); + assert!( + w3.iter().any(|w| w.code == "vendor_artifact_rebuilt"), + "rebuild is surfaced: {w3:?}" + ); + assert!(wheel.is_file(), "wheel rebuilt at the recorded path"); + assert_eq!( + tokio::fs::read(fx.root.join("pyproject.toml")) + .await + .unwrap(), + pyproject1, + "pyproject untouched by the rebuild" + ); + assert_eq!( + tokio::fs::read(fx.root.join("uv.lock")).await.unwrap(), + lock1, + "uv.lock untouched by the rebuild" + ); + } + #[tokio::test] async fn uuid_traversal_is_refused_before_any_write() { let fx = e2e_fixture().await; diff --git a/crates/socket-patch-core/src/patch/vendor/pypi_uv.rs b/crates/socket-patch-core/src/patch/vendor/pypi_uv.rs index 7fba29c..2f895ab 100644 --- a/crates/socket-patch-core/src/patch/vendor/pypi_uv.rs +++ b/crates/socket-patch-core/src/patch/vendor/pypi_uv.rs @@ -259,10 +259,20 @@ pub fn classify_dependency(p: &UvProject, canon_name: &str) -> UvDepClass { /// Split out of [`load_uv_project`] because they need the target name; the /// orchestrator runs them pre-flight so a refusal happens before the wheel /// artifact is built. +/// Pre-flight wiring state for one package (mirrors `PdmTarget`). 
+#[derive(Debug, PartialEq, Eq)] +pub(super) enum UvTarget { + Fresh, + /// `[tool.uv.sources]` already routes the package through THIS patch + /// uuid's vendored wheel — the in-sync hot path. + InSync, +} + pub(super) fn check_target_guards( p: &UvProject, canon_name: &str, -) -> Result<(), (&'static str, String)> { + record_uuid: &str, +) -> Result { // The same name at multiple versions/sources (platform forks) means one // surgical [[package]] rewrite would mispin the other forks — refuse. let units = p @@ -309,6 +319,13 @@ pub(super) fn check_target_guards( .and_then(|t| t.get("path")) .and_then(Value::as_str) .unwrap_or(""); + // Ours at the SAME patch generation: in sync — the sources and + // override entries are our own first-run edits, expected here. + if super::path::parse_vendor_path(path) + .is_some_and(|parts| parts.eco == "pypi" && parts.uuid == record_uuid) + { + return Ok(UvTarget::InSync); + } let detail = if path.contains(".socket/vendor/pypi/") { format!( "[tool.uv.sources] already routes {key} to a socket-patch vendored wheel; \ @@ -345,7 +362,7 @@ pub(super) fn check_target_guards( } } } - Ok(()) + Ok(UvTarget::Fresh) } /// Wire the pair for the vendored wheel. 
Writes `pyproject.toml` FIRST, then @@ -362,8 +379,9 @@ pub async fn wire_uv( wheel_file_name: &str, wheel_sha256_hex: &str, class: UvDepClass, + record_uuid: &str, ) -> Result<(Vec, UvMeta), (&'static str, String)> { - check_target_guards(p, canon_name)?; + check_target_guards(p, canon_name, record_uuid)?; let mut wiring: Vec = Vec::new(); // ── pyproject.toml (computed in memory; committed before the lock) ──── @@ -1296,6 +1314,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1343,6 +1362,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Transitive, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1442,6 +1462,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); @@ -1459,6 +1480,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Transitive, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); @@ -1480,13 +1502,15 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); assert_eq!(err.0, "pypi_uv_source_already_exists"); assert!(err.1.contains("user-authored"), "{}", err.1); - // an existing SOCKET source refuses too, pointing at --revert + // an existing SOCKET source from a STALE patch generation refuses, + // pointing at --revert; the SAME generation is the in-sync hot path. 
let tmp = write_pair( &format!("{DIRECT_REGISTRY_PYPROJECT}\n[tool.uv.sources]\nsix = {{ path = \"{REL_WHEEL}\" }}\n"), DIRECT_REGISTRY_LOCK, @@ -1502,11 +1526,17 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "11111111-2222-4333-8444-555555555555", ) .await .unwrap_err(); assert_eq!(err.0, "pypi_uv_source_already_exists"); assert!(err.1.contains("--revert"), "{}", err.1); + assert_eq!( + check_target_guards(&p, "six", "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f"), + Ok(UvTarget::InSync), + "the same patch generation is in sync, not a refusal" + ); // a user override for the package let tmp = write_pair( @@ -1524,6 +1554,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Transitive, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); @@ -1567,6 +1598,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap_err(); @@ -1593,6 +1625,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1622,6 +1655,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Transitive, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1654,6 +1688,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); @@ -1682,6 +1717,7 @@ wheels = [ WHEEL_NAME, WHEEL_SHA, UvDepClass::Direct, + "9f6b2c4e-1d3a-4f6b-8c2d-7e5a9b1c3d5f", ) .await .unwrap(); diff --git a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs index 22f6626..618b4a0 100644 --- a/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs +++ b/crates/socket-patch-core/src/patch/vendor/registry_fetch.rs @@ -71,7 +71,11 @@ pub enum FetchError { } /// One shared client for all fetches in a run. 
-pub fn build_registry_client() -> reqwest::Client { +/// The registry HTTP client type, nameable by callers that don't depend on +/// reqwest directly (the CLI's pristine-source ladder). +pub type RegistryClient = reqwest::Client; + +pub fn build_registry_client() -> RegistryClient { reqwest::Client::builder() .user_agent(USER_AGENT) .timeout(Duration::from_secs(60)) @@ -550,34 +554,46 @@ async fn fetch_golang( async fn fetch_npm( entry: &LockfileEntry, client: &reqwest::Client, +) -> Result { + fetch_npm_inner(entry, client, true).await +} + +async fn fetch_npm_inner( + entry: &LockfileEntry, + client: &reqwest::Client, + verify: bool, ) -> Result { let url = entry .resolved .clone() .unwrap_or_else(|| npm_tarball_url(&npm_registry_base(), &entry.name, &entry.version)); let bytes = download(client, &url).await.map_err(FetchError::Failed)?; - match &entry.integrity { - // yarn berry locks never hash the tarball itself — the checksum is - // sha512 of the deterministic cache zip. Rebuild it from the fetched - // bytes (the same spike-pinned recipe the berry wiring uses) and - // compare. Only cacheKey 10c0 (yarn 4 default) is reproducible. - LockIntegrity::BerryChecksum(expected) => { - if !expected.starts_with("10c0/") { - return Err(FetchError::Unverifiable(format!( - "yarn berry checksum `{expected}` uses a cacheKey other than 10c0; the \ - cache-zip recipe is not reproducible for it" - ))); - } - let actual = super::berry_zip::berry_cache_checksum_10c0(&bytes, &entry.name) - .map_err(FetchError::Failed)?; - if &actual != expected { - return Err(FetchError::Failed(format!( - "yarn berry cache checksum mismatch: lockfile records {expected}, the \ - fetched tarball rebuilds to {actual}" - ))); + if !verify { + // fetch_npm_unverified: the caller owns end-to-end verification. + } else { + match &entry.integrity { + // yarn berry locks never hash the tarball itself — the checksum is + // sha512 of the deterministic cache zip. 
Rebuild it from the fetched + // bytes (the same spike-pinned recipe the berry wiring uses) and + // compare. Only cacheKey 10c0 (yarn 4 default) is reproducible. + LockIntegrity::BerryChecksum(expected) => { + if !expected.starts_with("10c0/") { + return Err(FetchError::Unverifiable(format!( + "yarn berry checksum `{expected}` uses a cacheKey other than 10c0; \ + the cache-zip recipe is not reproducible for it" + ))); + } + let actual = super::berry_zip::berry_cache_checksum_10c0(&bytes, &entry.name) + .map_err(FetchError::Failed)?; + if &actual != expected { + return Err(FetchError::Failed(format!( + "yarn berry cache checksum mismatch: lockfile records {expected}, \ + the fetched tarball rebuilds to {actual}" + ))); + } } + other => verify_integrity(&bytes, other)?, } - other => verify_integrity(&bytes, other)?, } let tmp = tempfile::tempdir() @@ -681,6 +697,62 @@ async fn download(client: &reqwest::Client, url: &str) -> Result, String /// Verify downloaded bytes against the lock-recorded verifier. Runs BEFORE /// any disk write. Berry cache-zip checksums and go.sum dirhashes have /// dedicated verifiers in their ecosystems' fetchers. +/// Fetch + stage an npm package from its conventional registry URL WITHOUT +/// content verification. The download/extract caps still apply. +/// +/// SECURITY: callers MUST end-to-end verify whatever they derive from the +/// staged copy against an independent trust anchor before committing it — +/// repair's ledger reconstruction verifies the deterministically REBUILT +/// vendored tarball against the integrity the rewired lockfile records +/// (`artifact_matches_integrity`); a tampered pristine source then changes +/// the rebuilt bytes and fails closed. 
+pub async fn fetch_npm_unverified( + name: &str, + version: &str, + client: &reqwest::Client, +) -> Result { + let entry = LockfileEntry { + ecosystem: "npm", + name: name.to_string(), + version: version.to_string(), + purl: format!("pkg:npm/{name}@{version}"), + resolved: None, + integrity: LockIntegrity::None, + }; + fetch_npm_inner(&entry, client, false).await +} + +/// Whole-artifact verification against a lock-recorded integrity (the same +/// verifiers the fetch path uses, including the berry cache-zip rebuild). +/// `name` feeds the berry cache-zip recipe; ignored otherwise. +pub fn artifact_matches_integrity( + bytes: &[u8], + name: &str, + integrity: &LockIntegrity, +) -> Result<(), String> { + match integrity { + LockIntegrity::BerryChecksum(expected) => { + if !expected.starts_with("10c0/") { + return Err(format!( + "yarn berry checksum `{expected}` uses a cacheKey other than 10c0" + )); + } + let actual = super::berry_zip::berry_cache_checksum_10c0(bytes, name)?; + if &actual == expected { + Ok(()) + } else { + Err(format!( + "yarn berry cache checksum mismatch: lockfile records {expected}, the \ + artifact rebuilds to {actual}" + )) + } + } + other => verify_integrity(bytes, other).map_err(|e| match e { + FetchError::Failed(d) | FetchError::Unverifiable(d) => d, + }), + } +} + fn verify_integrity(bytes: &[u8], integrity: &LockIntegrity) -> Result<(), FetchError> { match integrity { LockIntegrity::Sri(sri) => verify_sri(bytes, sri).map_err(FetchError::Failed), diff --git a/crates/socket-patch-core/src/patch/vendor/verify.rs b/crates/socket-patch-core/src/patch/vendor/verify.rs index 562eef6..45622d6 100644 --- a/crates/socket-patch-core/src/patch/vendor/verify.rs +++ b/crates/socket-patch-core/src/patch/vendor/verify.rs @@ -164,6 +164,101 @@ fn read_wheel_to_map(whl: &Path) -> Result>, String> { Ok(out) } +/// Hard cap on whole-artifact bytes hashed by the health check — committed +/// artifacts are small (a package tarball/wheel); a tampered multi-GiB 
file +/// must not stall `repair`. +const MAX_HEALTH_HASH_BYTES: u64 = 512 * 1024 * 1024; + +/// Classified health of one ledger entry's committed artifact, for +/// `repair`-style callers that need a DECISION (rebuild or not), not just a +/// routing tag. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ArtifactHealth { + /// Exists and every record file hashes to its afterHash (and, for + /// file-shaped artifacts, the whole file matches the ledger sha256). + Healthy, + /// Nothing at the artifact path: rebuildable. + Missing, + /// Present but failing verification: rebuildable. `reason` is the + /// stable routing tag (`vendor_hash_mismatch`, `file_not_found`, + /// `vendor_artifact_unreadable`, `vendor_sha256_mismatch`). + Corrupt { reason: String }, + /// The ledger/artifact uuid doesn't match the record: a re-vendor is + /// pending — not repair's job. + StaleUuid, + /// The entry can't be judged (poisoned path, empty record): fail + /// closed, never rebuild from it. + Unverifiable { reason: String }, +} + +/// Health-check one vendored artifact against its patch record: the +/// per-file afterHash verification of [`verify_vendored_patch_record`] +/// plus, for file-shaped artifacts (`.tgz`/`.tar.gz`/`.whl`) with a +/// recorded ledger sha256, a whole-file hash cross-check — the rewired +/// lockfile integrity references those exact bytes, so silent drift breaks +/// the package manager even when the patched members still verify. 
+pub async fn check_vendored_artifact( + project_root: &Path, + entry: &VendorEntry, + record: &PatchRecord, +) -> ArtifactHealth { + match verify_vendored_patch_record(project_root, entry, record).await { + Err(tag) => match tag.as_str() { + "vendor_artifact_missing" => ArtifactHealth::Missing, + "vendor_uuid_mismatch" => ArtifactHealth::StaleUuid, + "vendor_hash_mismatch" | "file_not_found" | "vendor_artifact_unreadable" => { + ArtifactHealth::Corrupt { reason: tag } + } + _ => ArtifactHealth::Unverifiable { reason: tag }, + }, + Ok(()) => { + let norm = entry.artifact.path.replace('\\', "/"); + let file_shaped = + norm.ends_with(".tgz") || norm.ends_with(".tar.gz") || norm.ends_with(".whl"); + if !file_shaped || entry.artifact.sha256.is_empty() { + return ArtifactHealth::Healthy; + } + // The path already passed checked_artifact_path inside the + // verification above. + match file_sha256_hex(&project_root.join(&norm)).await { + Some(hex) if hex.eq_ignore_ascii_case(&entry.artifact.sha256) => { + ArtifactHealth::Healthy + } + Some(_) => ArtifactHealth::Corrupt { + reason: "vendor_sha256_mismatch".to_string(), + }, + None => ArtifactHealth::Corrupt { + reason: "vendor_artifact_unreadable".to_string(), + }, + } + } + } +} + +/// Plain sha256 hex of a regular file, size-capped; `None` on any read +/// failure or cap breach. Public for repair's ledger re-synthesis (the +/// rebuilt artifact's recorded sha). 
+pub async fn file_sha256_hex(path: &Path) -> Option { + use sha2::{Digest, Sha256}; + use tokio::io::AsyncReadExt; + + let meta = tokio::fs::metadata(path).await.ok()?; + if !meta.is_file() || meta.len() > MAX_HEALTH_HASH_BYTES { + return None; + } + let mut file = tokio::fs::File::open(path).await.ok()?; + let mut hasher = Sha256::new(); + let mut buf = vec![0u8; 64 * 1024]; + loop { + let n = file.read(&mut buf).await.ok()?; + if n == 0 { + break; + } + hasher.update(&buf[..n]); + } + Some(hex::encode(hasher.finalize())) +} + fn verify_member_map( members: &HashMap>, record: &PatchRecord, @@ -421,4 +516,89 @@ mod tests { "vendor_artifact_missing" ); } + + /// Full classification matrix for the repair-facing health check. + #[tokio::test] + async fn artifact_health_classification_matrix() { + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + let rel = format!(".socket/vendor/npm/{UUID}/x-1.0.0.tgz"); + let rec = record(UUID, "package/index.js"); + + // Missing. + let ent = entry("npm", UUID, &rel); + assert_eq!( + check_vendored_artifact(root, &ent, &rec).await, + ArtifactHealth::Missing + ); + + // Healthy (no ledger sha recorded → member verification only). + tokio::fs::create_dir_all(root.join(format!(".socket/vendor/npm/{UUID}"))) + .await + .unwrap(); + write_tgz(&root.join(&rel), "package/index.js", PATCHED); + assert_eq!( + check_vendored_artifact(root, &ent, &rec).await, + ArtifactHealth::Healthy + ); + + // Healthy with a MATCHING ledger sha256. 
+ let tgz_bytes = tokio::fs::read(root.join(&rel)).await.unwrap(); + let mut ent_sha = entry("npm", UUID, &rel); + ent_sha.artifact.sha256 = { + use sha2::{Digest, Sha256}; + hex::encode(Sha256::digest(&tgz_bytes)) + }; + assert_eq!( + check_vendored_artifact(root, &ent_sha, &rec).await, + ArtifactHealth::Healthy + ); + + // Whole-file drift the member check can't see: members verify, but + // the bytes differ from what the lockfile integrity references + // (re-compressed archive → different sha). + ent_sha.artifact.sha256 = "0".repeat(64); + assert_eq!( + check_vendored_artifact(root, &ent_sha, &rec).await, + ArtifactHealth::Corrupt { + reason: "vendor_sha256_mismatch".to_string() + } + ); + + // Member tamper. + write_tgz(&root.join(&rel), "package/index.js", b"tampered"); + assert_eq!( + check_vendored_artifact(root, &ent, &rec).await, + ArtifactHealth::Corrupt { + reason: "vendor_hash_mismatch".to_string() + } + ); + + // Unreadable. + tokio::fs::write(root.join(&rel), b"\x1f\x8b00garbage") + .await + .unwrap(); + assert_eq!( + check_vendored_artifact(root, &ent, &rec).await, + ArtifactHealth::Corrupt { + reason: "vendor_artifact_unreadable".to_string() + } + ); + + // Stale uuid → not repair's job. + let rec_new = record("11111111-2222-4333-8444-555555555555", "package/index.js"); + assert_eq!( + check_vendored_artifact(root, &ent, &rec_new).await, + ArtifactHealth::StaleUuid + ); + + // Poisoned path → fail closed. 
+ let ent_bad = entry("npm", UUID, "../../outside.tgz"); + assert_eq!( + check_vendored_artifact(root, &ent_bad, &rec).await, + ArtifactHealth::Unverifiable { + reason: "vendor_path_unsafe".to_string() + } + ); + } } From c05805ebf2dfe46ee4b683d753dc5c0f9c546afa Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Fri, 12 Jun 2026 02:37:10 -0400 Subject: [PATCH 17/19] =?UTF-8?q?fix(pnpm):=20bind=20vendor=20edits=20to?= =?UTF-8?q?=20name@VERSION=20=E2=80=94=20multi-version=20vendoring=20corru?= =?UTF-8?q?pted=20the=20lock?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every "ours" probe in the pnpm backend matched ANY same-name .socket/vendor path, so a project vendoring the same package at several versions (Flowise: three fast-xml-parser, five minimatch patches) had each version's edit treat its siblings' entries as its own stale wiring: override values were clobbered to the wrong tarball and packages/ snapshots rekeys spliced duplicated mapping keys — which pnpm hard-rejects (ERR_PNPM_BROKEN_LOCKFILE), discovered live when repair's reconstruction re-dispatched all versions in sequence. - EditCtx::is_ours / both is_ours_key block probes / the override classification + lock-side mirror check now require the vendor path's leaf to be THIS name-version.tgz (any uuid — stale-uuid refresh unchanged); sibling-version vendored entries are skipped as coexisting. - edit_packages/edit_snapshot_rekey fail closed when BOTH the registry-keyed and our file:-keyed entry exist (a half-edited lock): refusing beats splicing a duplicate key. - Regression tests: multi-version vendor coexistence (per-section duplicate-key audit), integrity-drift refresh stays single-keyed, half-drifted duplicate guard. Live-verified on Flowise end to end: scan --vendor (16/16) → pnpm install --frozen-lockfile → rm -rf .socket → repair (16/16 reconstructed from the lockfile alone) → frozen install again, exit 0. 
Co-Authored-By: Claude Fable 5 --- .../src/patch/vendor/pnpm_lock.rs | 163 +++++++++++++++++- 1 file changed, 155 insertions(+), 8 deletions(-) diff --git a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs index 179d59d..01e7151 100644 --- a/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs +++ b/crates/socket-patch-core/src/patch/vendor/pnpm_lock.rs @@ -550,10 +550,15 @@ impl EditCtx<'_> { format!("{}@{}", self.name, self.spec) } - /// Does `value` point into `.socket/vendor/npm/` (ours — any uuid; a - /// stale uuid is rewritten to the current one with `original: None`)? + /// Does `value` point at OUR vendored tarball for THIS name@version + /// (any uuid — a stale uuid is rewritten to the current one with + /// `original: None`)? The leaf binding is load-bearing: a project can + /// vendor the SAME package at several versions, and a name-only match + /// would let one version's edit clobber another's entries. fn is_ours(&self, value: &str) -> bool { - parse_vendor_path(value).is_some_and(|p| p.eco == "npm") + parse_vendor_path(value).is_some_and(|p| { + p.eco == "npm" && p.leaf == super::npm_common::tgz_rel_leaf(self.name, self.version) + }) } /// The per-importer `specifier:` spelling: re-relativized for nested @@ -617,6 +622,15 @@ fn is_vendor_value(value: &str) -> bool { parse_vendor_path(value).is_some_and(|p| p.eco == "npm") } +/// A vendor value belonging to THIS `name@version`'s tarball (any uuid). +/// The leaf binding matters: a project can vendor the same package at +/// several versions, and edits must never treat a SIBLING version's +/// override/entry as their own. +fn vendor_value_is_for(value: &str, name: &str, version: &str) -> bool { + parse_vendor_path(value) + .is_some_and(|p| p.eco == "npm" && p.leaf == super::npm_common::tgz_rel_leaf(name, version)) +} + /// How the package.json `pnpm.overrides` table relates to the package /// being vendored. 
The lock's `overrides:` section must mirror this map /// key-for-key (pnpm hard-checks the two and fails @@ -673,13 +687,18 @@ fn classify_pkg_override( if override_key_name(key) != name { continue; } + let value_str = value.as_str().unwrap_or(""); + // A SIBLING version's vendored override coexists — not ours to + // touch (and not a conflict): skip it entirely. + if is_vendor_value(value_str) && !vendor_value_is_for(value_str, name, version) { + continue; + } if found.is_some() { return Err(format!( "package.json carries more than one pnpm override for `{name}`; vendoring \ cannot pick one — remove the extras first" )); } - let value_str = value.as_str().unwrap_or(""); let classified = if key.contains('>') { None } else if is_vendor_value(value_str) { @@ -727,6 +746,10 @@ fn check_lock_override( if override_key_name(&key) != name { continue; } + // A sibling version's vendored override coexists — skip it. + if is_vendor_value(&rest) && !vendor_value_is_for(&rest, name, version) { + continue; + } if key != effective_key { return Err(format!( "{PNPM_LOCK} carries an override key `{key}` for `{name}` that does not \ @@ -1018,7 +1041,7 @@ fn edit_packages( } else if block .key .strip_prefix(&ours_prefix) - .is_some_and(|rest| parse_vendor_path(rest).is_some_and(|p| p.eco == "npm")) + .is_some_and(|rest| ctx.is_ours(rest)) { has_ours = true; } @@ -1038,7 +1061,7 @@ fn edit_packages( let is_ours_key = block .key .strip_prefix(&ours_prefix) - .is_some_and(|rest| parse_vendor_path(rest).is_some_and(|p| p.eco == "npm")); + .is_some_and(|rest| ctx.is_ours(rest)); if !is_registry && !is_ours_key { i = block.end; continue; @@ -1125,7 +1148,7 @@ fn edit_snapshot_rekey( } else if block .key .strip_prefix(&ours_prefix) - .is_some_and(|rest| parse_vendor_path(rest).is_some_and(|p| p.eco == "npm")) + .is_some_and(|rest| ctx.is_ours(rest)) { has_ours = true; } @@ -1144,7 +1167,7 @@ fn edit_snapshot_rekey( let is_ours_key = block .key .strip_prefix(&ours_prefix) - 
.is_some_and(|rest| parse_vendor_path(rest).is_some_and(|p| p.eco == "npm")); + .is_some_and(|rest| ctx.is_ours(rest)); if !is_registry && !is_ours_key { i = block.end; continue; @@ -2697,6 +2720,130 @@ snapshots: assert_eq!(fx.read(PACKAGE_JSON).await, pkg_before, "pkg untouched"); } + /// Two VERSIONS of the same package vendored in sequence: each edit + /// must bind to its own version's entries — a name-only "ours" match + /// would let the second vendor clobber/rekey the first one's blocks + /// (live-debugged on Flowise: identical duplicated mapping keys). + #[tokio::test] + async fn multi_version_vendor_does_not_clobber_sibling_entries() { + let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; + let (r1, e1, _) = expect_done(fx.vendor(false).await); + assert!(r1.success); + assert!(e1.is_some()); + let tgz_13 = fx.rel_tgz(); + + // Vendor left-pad@1.2.0 under a DIFFERENT uuid (the `left-pad-old` + // npm: alias resolves it in the same lock). + let uuid2 = "22222222-3333-4444-8555-666666666666"; + let installed2 = fx.root().join("node_modules/left-pad-old"); + tokio::fs::create_dir_all(&installed2).await.unwrap(); + tokio::fs::write( + installed2.join("package.json"), + br#"{"name":"left-pad","version":"1.2.0"}"#, + ) + .await + .unwrap(); + tokio::fs::write(installed2.join("index.js"), ORIG_INDEX) + .await + .unwrap(); + let mut record2 = fx.record.clone(); + record2.uuid = uuid2.to_string(); + let blobs = fx.root().join(".socket/blobs"); + let sources = PatchSources::blobs_only(&blobs); + let outcome = vendor_pnpm( + "pkg:npm/left-pad@1.2.0", + &installed2, + fx.root(), + &record2, + &sources, + "2026-06-09T00:00:00Z", + false, + false, + ) + .await; + let (r2, e2, _) = expect_done(outcome); + assert!(r2.success, "{:?}", r2.error); + assert!(e2.is_some()); + + let lock = fx.read(PNPM_LOCK).await; + let key13 = format!(" left-pad@file:{tgz_13}:"); + let key12 = format!(" left-pad@file:.socket/vendor/npm/{uuid2}/left-pad-1.2.0.tgz:"); + // Both 
versions' packages + snapshots blocks exist exactly once + // each (snapshot entries may be inline `key: {}`). + for (key, label) in [(&key13, "1.3.0"), (&key12, "1.2.0")] { + assert_eq!( + lock.lines().filter(|l| l.starts_with(key.as_str())).count(), + 2, // packages + snapshots + "{label} entries intact:\n{lock}" + ); + } + // No duplicated mapping keys within a section (what pnpm + // hard-rejects): each section's 2-space keys are unique. + for section in ["overrides", "packages", "snapshots"] { + let Some((start, end)) = section_bounds(&split_lines(&lock), section) else { + continue; + }; + let lines = split_lines(&lock); + let mut keys: Vec = lines[start + 1..end] + .iter() + .filter_map(|l| parse_key_line(l, 2).map(|(k, _, _)| k)) + .collect(); + let total = keys.len(); + keys.sort_unstable(); + keys.dedup(); + assert_eq!(total, keys.len(), "duplicated keys in {section}:\n{lock}"); + } + } + + /// Re-vendor over a wired lock whose recorded integrity DRIFTED (e.g. + /// the artifact was rebuilt from a differently-shaped source): the + /// stale-ours refresh must REPLACE the file:-keyed blocks, never + /// duplicate them. + #[tokio::test] + async fn integrity_drift_refresh_never_duplicates_keys() { + let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; + let (_, entry, _) = expect_done(fx.vendor(false).await); + assert!(entry.is_some()); + + // Simulate drift: the lock records a DIFFERENT integrity for OUR + // file: entry (only) than the tarball the next run will pack. 
+ let lock = fx.read(PNPM_LOCK).await; + let drifted = lock + .lines() + .map(|l| { + if l.contains("tarball: file:.socket") { + l.replace("integrity: sha512-", "integrity: sha512-DRIFT") + } else { + l.to_string() + } + }) + .collect::>() + .join("\n"); + assert_ne!(drifted, lock); + tokio::fs::write(fx.root().join(PNPM_LOCK), &drifted) + .await + .unwrap(); + + let (result, _, _) = expect_done(fx.vendor(false).await); + assert!(result.success, "{:?}", result.error); + let healed = fx.read(PNPM_LOCK).await; + let ours_key = format!(" left-pad@file:{}:", fx.rel_tgz()); + let count = healed.lines().filter(|l| *l == ours_key.as_str()).count(); + assert_eq!( + count, 1, + "exactly one file:-keyed packages/snapshots block per section; lock: +{healed}" + ); + let snap_count = healed + .matches(&format!("left-pad@file:{}", fx.rel_tgz())) + .count(); + assert!( + !healed.contains("sha512-DRIFT"), + "drifted integrity healed: {snap_count} refs +{healed}" + ); + } + #[tokio::test] async fn dry_run_writes_nothing() { let fx = fixture_with(P1_BEFORE_PKG, P1_BEFORE_LOCK).await; From 6091280792b8e17d86cd6719cd6d956b91b1025b Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Fri, 12 Jun 2026 11:23:59 -0400 Subject: [PATCH 18/19] test(docker): pin scan --sync to --strict where apply --force must stay the writer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cargo/composer/golang/maven/nuget docker chains use an all-zeros beforeHash fixture and assert that the dedicated `apply --force` step is the one that patches (exactly one applied, skipped:0, marker written by apply). The new mismatch-warn default makes `scan --sync` overwrite the mismatched file with the verified blob during the scan itself, so the later apply reported already_patched and every gate failed. 
`--strict` restores the hard-error scan these scripts encode; the warn-overwrite default keeps its coverage in the wiremock apply/scan suites and the deno/gem/pypi docker chains (real beforeHashes). The remaining red CI (3-OS test, test-release, coverage, deno/pypi docker baselines) was the live patches API returning 503 "Service temporarily over capacity" during the run — transient, recovered. Co-Authored-By: Claude Fable 5 --- crates/socket-patch-cli/tests/docker_e2e_cargo.rs | 13 +++++++------ .../socket-patch-cli/tests/docker_e2e_composer.rs | 4 ++-- crates/socket-patch-cli/tests/docker_e2e_golang.rs | 2 +- crates/socket-patch-cli/tests/docker_e2e_maven.rs | 2 +- crates/socket-patch-cli/tests/docker_e2e_nuget.rs | 4 ++-- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/crates/socket-patch-cli/tests/docker_e2e_cargo.rs b/crates/socket-patch-cli/tests/docker_e2e_cargo.rs index 9966217..7f71f1e 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_cargo.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_cargo.rs @@ -177,12 +177,13 @@ chmod u+w "$LIB_RS" || true # scan --sync writes manifest + blob; the cargo crawler with --global # probes $CARGO_HOME/registry/src/. Note: in this fixture scan's own -# apply pass matches 0 files (the all-zeros beforeHash doesn't match the -# real cfg-if bytes), so scan exits non-zero (partial_failure) BY DESIGN -# — the dedicated `apply --force` step below does the real patching. -# Exit code is logged for diagnostics, not gated; the gate is the exact -# content-hash check at the end. -socket-patch scan --json --sync --yes --global \ +# apply pass meets an all-zeros beforeHash that doesn't match the real +# cfg-if bytes; `--strict` pins the hard-error behavior (the default +# would warn and apply the full blob) so scan exits non-zero +# (partial_failure) BY DESIGN and the dedicated `apply --force` step +# below stays the verified writer. 
Exit code is logged for diagnostics, +# not gated; the gate is the exact content-hash check at the end. +socket-patch scan --json --sync --strict --yes --global \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems cargo > /tmp/sync.out 2>/tmp/sync.err SCAN_RC=$? diff --git a/crates/socket-patch-cli/tests/docker_e2e_composer.rs b/crates/socket-patch-cli/tests/docker_e2e_composer.rs index 6686e82..b71ec49 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_composer.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_composer.rs @@ -225,7 +225,7 @@ fi PRE_SHA=$(sha256sum "$PHP_FILE" | cut -d' ' -f1) # scan exit code is intentionally not gated (see verify_snippet); capture JSON. -socket-patch scan --json --sync --yes \ +socket-patch scan --json --sync --strict --yes \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems composer > /tmp/scan.json 2>/tmp/sync.err cat /tmp/sync.err >&2 @@ -264,7 +264,7 @@ PRE_SHA=$(sha256sum "$PHP_FILE" | cut -d' ' -f1) mkdir -p /workspace/proj && cd /workspace/proj # scan exit code is intentionally not gated (see verify_snippet); capture JSON. -socket-patch scan --json --sync --yes --global \ +socket-patch scan --json --sync --strict --yes --global \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems composer > /tmp/scan.json 2>/tmp/sync.err cat /tmp/sync.err >&2 diff --git a/crates/socket-patch-cli/tests/docker_e2e_golang.rs b/crates/socket-patch-cli/tests/docker_e2e_golang.rs index e281432..9f793db 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_golang.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_golang.rs @@ -166,7 +166,7 @@ chmod u+w "$GIN_GO" || true # exits non-zero (partial_failure) BY DESIGN — the dedicated `apply # --force` step below does the real patching. Exit code is logged for # diagnostics, not gated; the gate is the exact content-hash check below. 
-socket-patch scan --json --sync --yes --global \ +socket-patch scan --json --sync --strict --yes --global \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems golang > /tmp/sync.out 2>/tmp/sync.err SCAN_RC=$? diff --git a/crates/socket-patch-cli/tests/docker_e2e_maven.rs b/crates/socket-patch-cli/tests/docker_e2e_maven.rs index 526bbf4..c40c583 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_maven.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_maven.rs @@ -190,7 +190,7 @@ chmod u+w "$POM_FILE" || true # gated (scan's own apply pass matches 0 files because the all-zeros # beforeHash doesn't match the real .pom bytes); the gate is the exact # content-hash check at the end. -socket-patch scan --json --sync --yes --global \ +socket-patch scan --json --sync --strict --yes --global \ --api-url '{api_url}' --api-token fake --org {ORG} \ --ecosystems maven > /tmp/sync.out 2>/tmp/sync.err SCAN_RC=$? diff --git a/crates/socket-patch-cli/tests/docker_e2e_nuget.rs b/crates/socket-patch-cli/tests/docker_e2e_nuget.rs index 7cad49e..13b56de 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_nuget.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_nuget.rs @@ -198,7 +198,7 @@ echo "===SCAN VERIFIED===" >&2 # because the fixture's placeholder beforeHash doesn't match the real # installed bytes. That's expected — the separate forced apply below # is what actually writes the patch, so we only log sync's exit code. -socket-patch scan --json --sync --yes "${{COMMON_ARGS[@]}}" >/tmp/sync.out 2>/tmp/sync.err +socket-patch scan --json --sync --strict --yes "${{COMMON_ARGS[@]}}" >/tmp/sync.out 2>/tmp/sync.err echo "sync exit=$?" >&2 cat /tmp/sync.out >&2 || true cat /tmp/sync.err >&2 || true @@ -326,7 +326,7 @@ echo "===SCAN VERIFIED===" >&2 # 2. scan --sync. May exit non-zero (un-forced sync-apply HashMismatch # against the fixture's placeholder beforeHash); the forced apply # below is what writes the patch, so only log sync's exit code. 
-socket-patch scan --json --sync --yes "${{COMMON_ARGS[@]}}" >/tmp/sync.out 2>/tmp/sync.err +socket-patch scan --json --sync --strict --yes "${{COMMON_ARGS[@]}}" >/tmp/sync.out 2>/tmp/sync.err echo "sync exit=$?" >&2 cat /tmp/sync.out >&2 || true cat /tmp/sync.err >&2 || true From cb1b0a601467698ced220d91bef46edcc68800d9 Mon Sep 17 00:00:00 2001 From: Mikola Lysenko Date: Fri, 12 Jun 2026 11:29:16 -0400 Subject: [PATCH 19/19] test(docker): pin the content_mismatch_overwritten warning in the force-apply gates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With scan pinned to --strict, the dedicated `apply --force` step is the writer again — but force-overwriting the all-zeros-baseline fixture now also surfaces the content_mismatch_overwritten warning as a Skipped event, so the old `skipped:0` gates fail. Assert the new contract instead: exactly one skip AND the warning's errorCode present (cargo/golang/maven/nuget×2; composer has no skipped gate). All five suites verified locally against freshly built images. 
Co-Authored-By: Claude Fable 5 --- .../tests/docker_e2e_cargo.rs | 12 ++++++++-- .../tests/docker_e2e_golang.rs | 12 ++++++++-- .../tests/docker_e2e_maven.rs | 12 ++++++++-- .../tests/docker_e2e_nuget.rs | 24 +++++++++++++++---- 4 files changed, 50 insertions(+), 10 deletions(-) diff --git a/crates/socket-patch-cli/tests/docker_e2e_cargo.rs b/crates/socket-patch-cli/tests/docker_e2e_cargo.rs index 7f71f1e..95cd40a 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_cargo.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_cargo.rs @@ -228,8 +228,16 @@ grep -q '"failed": 0,' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -q '"skipped": 0,' /tmp/apply.out || {{ - echo "FAIL: apply JSON did not report skipped:0" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. +grep -q '"skipped": 1,' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} diff --git a/crates/socket-patch-cli/tests/docker_e2e_golang.rs b/crates/socket-patch-cli/tests/docker_e2e_golang.rs index 9f793db..e2d5d16 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_golang.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_golang.rs @@ -209,8 +209,16 @@ grep -qE '^[[:space:]]*"failed": 0,[[:space:]]*$' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -qE '^[[:space:]]*"skipped": 0,[[:space:]]*$' /tmp/apply.out || {{ - echo "FAIL: apply JSON reported a non-zero skipped count" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, 
nothing else skipped. +grep -qE '^[[:space:]]*"skipped": 1,[[:space:]]*$' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} diff --git a/crates/socket-patch-cli/tests/docker_e2e_maven.rs b/crates/socket-patch-cli/tests/docker_e2e_maven.rs index c40c583..5dc3474 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_maven.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_maven.rs @@ -235,8 +235,16 @@ grep -q '"failed": 0,' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -q '"skipped": 0,' /tmp/apply.out || {{ - echo "FAIL: apply JSON did not report skipped:0" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. 
+grep -q '"skipped": 1,' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} diff --git a/crates/socket-patch-cli/tests/docker_e2e_nuget.rs b/crates/socket-patch-cli/tests/docker_e2e_nuget.rs index 13b56de..622b9a2 100644 --- a/crates/socket-patch-cli/tests/docker_e2e_nuget.rs +++ b/crates/socket-patch-cli/tests/docker_e2e_nuget.rs @@ -240,8 +240,16 @@ grep -q '"failed": 0,' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -q '"skipped": 0,' /tmp/apply.out || {{ - echo "FAIL: apply JSON did not report skipped:0" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. +grep -q '"skipped": 1,' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }} @@ -362,8 +370,16 @@ grep -q '"failed": 0,' /tmp/apply.out || {{ cat /tmp/apply.out >&2 exit 1 }} -grep -q '"skipped": 0,' /tmp/apply.out || {{ - echo "FAIL: apply JSON did not report skipped:0" >&2 +# The --force overwrite of the mismatched baseline surfaces the +# content_mismatch_overwritten warning as a Skipped event (the +# mismatch-warn contract) — exactly that one, nothing else skipped. 
+grep -q '"skipped": 1,' /tmp/apply.out || {{ + echo "FAIL: apply JSON did not report skipped:1 (the mismatch-overwrite warning)" >&2 + cat /tmp/apply.out >&2 + exit 1 +}} +grep -q '"errorCode": "content_mismatch_overwritten"' /tmp/apply.out || {{ + echo "FAIL: apply JSON missing the content_mismatch_overwritten warning event" >&2 cat /tmp/apply.out >&2 exit 1 }}