diff --git a/.github/bump_version.py b/.github/bump_version.py index 391c64dc..3dd57a82 100644 --- a/.github/bump_version.py +++ b/.github/bump_version.py @@ -1,5 +1,6 @@ """Infer semver bump from towncrier fragment types and update version.""" +import json import re import subprocess import sys @@ -116,41 +117,34 @@ def update_file(path: Path, new_version: str): print(f" Updated {path}") -def sync_release_manifest_versions(manifest_dir: Path, new_version: str): - if not manifest_dir.exists(): +def sync_bundle_versions(bundle_path: Path, new_version: str): + if not bundle_path.exists(): return - - for manifest_path in sorted(manifest_dir.glob("*.json")): - country_id = manifest_path.stem - text = manifest_path.read_text() - updated = text - updated, bundle_id_replacements = re.subn( - r'("bundle_id"\s*:\s*")[^"]+(")', - rf"\g<1>{country_id}-{new_version}\g<2>", - updated, - count=1, + bundle = json.loads(bundle_path.read_text()) + required = ["bundle_version", "policyengine_version", "packages"] + missing = [field for field in required if field not in bundle] + if missing: + print( + f"Could not update {bundle_path}: missing fields {', '.join(missing)}", + file=sys.stderr, ) - updated, policyengine_version_replacements = re.subn( - r'("policyengine_version"\s*:\s*")[^"]+(")', - rf"\g<1>{new_version}\g<2>", - updated, - count=1, + sys.exit(1) + bundle["bundle_version"] = new_version + bundle["policyengine_version"] = new_version + try: + bundle["packages"]["policyengine"]["version"] = new_version + except KeyError: + print( + f"Could not update {bundle_path}: missing packages.policyengine.version", + file=sys.stderr, ) - missing_fields = [] - if bundle_id_replacements == 0: - missing_fields.append("bundle_id") - if policyengine_version_replacements == 0: - missing_fields.append("policyengine_version") - if missing_fields: - print( - f"Could not update {manifest_path}: missing fields " - f"{', '.join(missing_fields)}", - file=sys.stderr, - ) - sys.exit(1) - if 
updated != text: - manifest_path.write_text(updated) - print(f" Updated {manifest_path}") + sys.exit(1) + for country_id, data_release in bundle.get("data_releases", {}).items(): + if isinstance(data_release, dict): + data_release["policyengine_version"] = new_version + data_release["bundle_id"] = f"{country_id}-{new_version}" + bundle_path.write_text(json.dumps(bundle, indent=2, sort_keys=True) + "\n") + print(f" Updated {bundle_path}") def main(): @@ -158,7 +152,7 @@ def main(): pyproject = root / "pyproject.toml" changelog = root / "CHANGELOG.md" changelog_dir = root / "changelog.d" - manifest_dir = root / "src" / "policyengine" / "data" / "release_manifests" + bundle_path = root / "src" / "policyengine" / "data" / "bundle" / "manifest.json" current = get_current_version(pyproject, changelog, root) bump = infer_bump(changelog_dir) @@ -167,7 +161,7 @@ def main(): print(f"Version: {current} -> {new} ({bump})") update_file(pyproject, new) - sync_release_manifest_versions(manifest_dir, new) + sync_bundle_versions(bundle_path, new) if __name__ == "__main__": diff --git a/.github/workflows/pr_code_changes.yaml b/.github/workflows/pr_code_changes.yaml index fcbf08e5..78711dbf 100644 --- a/.github/workflows/pr_code_changes.yaml +++ b/.github/workflows/pr_code_changes.yaml @@ -6,8 +6,11 @@ on: paths: - src/** - tests/** + - scripts/** - .github/** - changelog.d/** + - pyproject.toml + - src/policyengine/data/bundle/manifest.json workflow_dispatch: jobs: @@ -73,6 +76,45 @@ jobs: run: uv pip install --system . 
h5py - name: Smoke-import core modules run: python -c "import policyengine; from policyengine.core import Dataset, Policy, Simulation; from policyengine.outputs import aggregate, poverty, inequality; print('import OK')" + BundleVerification: + name: Verify bundle metadata + runs-on: ubuntu-latest + env: + POLICYENGINE_SKIP_COUNTRY_IMPORTS: "1" + steps: + - uses: actions/checkout@v6 + - name: Install uv + uses: astral-sh/setup-uv@v8.1.0 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.13' + - name: Check derived bundle metadata + run: python scripts/bundle.py check + - name: Install bundle package scaffold + run: | + uv pip install -e ".[models]" --system + python - <<'PY' + import subprocess + import sys + + from policyengine import bundle + + requirements = [ + requirement + for requirement in bundle.bundle_install_requirements( + countries=["us", "uk"] + ) + if not requirement.startswith("policyengine==") + ] + subprocess.check_call( + [sys.executable, "-m", "pip", "install", *requirements] + ) + PY + - name: Check installed package consistency + run: python -m pip check + - name: Verify bundle packages + run: policyengine bundle verify --country us --country uk --packages-only --json Test: runs-on: macos-latest strategy: diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index 5ccaa6e2..834e6bb9 100644 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -117,6 +117,8 @@ jobs: python-version: '3.13' - name: Build changelog run: pip install yaml-changelog towncrier && make changelog + - name: Generate derived bundle metadata + run: python scripts/bundle.py generate - name: Preview changelog update run: ".github/get-changelog-diff.sh" - name: Install package for TRO regeneration @@ -124,8 +126,8 @@ jobs: - name: Regenerate bundled TRACE TROs env: HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} - run: python scripts/generate_trace_tros.py - - name: Update changelog and TROs + run: 
python scripts/bundle.py generate --include-tros + - name: Update changelog, bundle metadata, and TROs uses: EndBug/add-and-commit@v9 with: add: "." @@ -154,6 +156,15 @@ jobs: run: ".github/publish-git-tag.sh" - name: Build package run: python -m build + - name: Export bundle release assets + run: python scripts/export_bundle_release_assets.py --dist-dir dist + - name: Verify bundle package metadata + env: + POLICYENGINE_SKIP_COUNTRY_IMPORTS: "1" + run: | + VERSION=$(python .github/fetch_version.py) + policyengine bundle verify --country us --country uk --packages-only --json \ + > "dist/policyengine-bundle-$VERSION.verification.json" - name: Publish a Python distribution to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: @@ -166,4 +177,8 @@ jobs: gh release create "$VERSION" \ --title "v$VERSION" \ --notes "See [CHANGELOG.md](https://github.com/PolicyEngine/policyengine.py/blob/main/CHANGELOG.md) for details." \ - --latest + --latest \ + "dist/policyengine-bundle-$VERSION.json" \ + "dist/policyengine-bundle-$VERSION.constraints.txt" \ + "dist/policyengine-bundle-$VERSION.citation.txt" \ + "dist/policyengine-bundle-$VERSION.verification.json" diff --git a/README.md b/README.md index 6c9aab18..86481cbc 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,19 @@ pip install policyengine[uk] # UK model only pip install policyengine[us] # US model only ``` +For a certified package-plus-dataset bundle, use the bundle installer as the +single setup command: + +```bash +uvx --from policyengine policyengine bundle install +``` + +This installs the bundled package scaffold with pip, downloads the certified US +and UK datasets into `./data`, and writes a local receipt that can be checked +with `policyengine bundle status`. When run from `uvx` or `pipx`, the installer +creates or reuses `./.venv`; inside an existing virtualenv or conda environment, +it installs into the active environment. 
+ ### For development ```bash @@ -111,6 +124,7 @@ uv pip install -e .[dev] # install with dev dependencies (pytest, ruff, m | **Library user** | `pip install policyengine` | Using the package in your own code | | **UK only** | `pip install policyengine[uk]` | Only need UK simulations | | **US only** | `pip install policyengine[us]` | Only need US simulations | +| **Certified bundle** | `uvx --from policyengine policyengine bundle install` | Reproducible model-plus-data setup | | **Developer** | `uv pip install -e .[dev]` | Contributing to the package | ### Common commands diff --git a/changelog.d/bundle-system.added.md b/changelog.d/bundle-system.added.md new file mode 100644 index 00000000..5c878164 --- /dev/null +++ b/changelog.d/bundle-system.added.md @@ -0,0 +1 @@ +Add PolicyEngine bundle metadata and CLI support for installing, inspecting, and verifying a cited package-plus-dataset release. diff --git a/docs/bundles.md b/docs/bundles.md new file mode 100644 index 00000000..5d2e1ab5 --- /dev/null +++ b/docs/bundles.md @@ -0,0 +1,100 @@ +# PolicyEngine bundles + +A PolicyEngine bundle is the exact first-party package set and certified +dataset set for a `policyengine` release. The bundle version is the +`policyengine` version. + +Regular package installation remains standard pip: + +```bash +pip install "policyengine==4.19.1" +pip install "policyengine[us]==4.19.1" +pip install "policyengine[uk]==4.19.1" +``` + +For a certified model-plus-data install, run the bundle installer as the single +setup command: + +```bash +uvx --from policyengine==4.19.1 policyengine bundle install 4.19.1 +``` + +With no version pin, `uvx` uses the newest published `policyengine` release: + +```bash +uvx --from policyengine policyengine bundle install +``` + +When run from `uvx` or `pipx`, the installer creates or reuses `./.venv`. +Inside an existing virtualenv or conda environment, it installs into that active +environment. 
The installer then installs the +exact bundled package scaffold with pip, downloads certified US and UK datasets +into `./data`, moves replaced dataset files into +`./data/.policyengine-bundle-backups//`, and writes a +`./data/.policyengine-bundle-receipt.json` receipt that records the target +Python. + +Country-specific and package-only installs are supported: + +```bash +uvx --from policyengine policyengine bundle install --country uk +uvx --from policyengine policyengine bundle install --no-datasets +``` + +Use `--yes` for CI/CD. Without `--yes`, dataset downloads ask for confirmation. + +The canonical bundle manifest is `src/policyengine/data/bundle/manifest.json`. +Derived artifacts are: + +- `pyproject.toml` extras +- `src/policyengine/data/bundle/{country}.trace.tro.jsonld` +- GitHub release assets exported from the bundle manifest + +Inspect or verify a local setup with: + +```bash +uvx --from policyengine policyengine bundle status --data-dir ./data +uvx --from policyengine policyengine bundle verify 4.19.1 --data-dir ./data +policyengine bundle manifest 4.19.1 +``` + +`status` and `verify` read the receipt and inspect the Python environment that +`install` targeted. Use `--venv` or `--python` only to inspect a different +target explicitly. + +## Bundle-only PRs + +Run: + +```bash +python scripts/bundle.py update-packages \ + --core 3.27.0 \ + --us 1.730.0 \ + --uk 2.91.0 \ + --us-data 1.118.0 +``` + +To certify a new data release from a data-producer manifest, run: + +```bash +python scripts/bundle.py certify-data \ + --country uk \ + --data-producer populace \ + --manifest-uri hf://dataset/policyengine/populace-uk-private@/releases//release_manifest.json +``` + +Use `python scripts/bundle.py generate` to regenerate derived bundle metadata, +and `python scripts/bundle.py generate --include-tros` when TRACE TRO sidecars +should also be regenerated. Private data releases require `HUGGING_FACE_TOKEN` +or `HF_TOKEN` for TRO regeneration. 
+ +`update-packages` and `certify-data` update bundle metadata and write a changelog fragment (patch-level for `update-packages`). Do not bump +the `policyengine` version manually in the PR; the existing release workflow +bumps the package and bundle versions together after merge. + +CI checks derived bundle metadata, installs the package scaffold from the +bundle manifest, runs `pip check`, and verifies the packaged bundle metadata +with lightweight URI checks. Dataset downloads are handled by +`policyengine bundle install`, so certified UK data can be pinned by manifest +version and downloaded from Hugging Face even when the matching +`policyengine-uk-data` package is not published to PyPI. diff --git a/docs/data-publishing-design.md b/docs/data-publishing-design.md index bee9b213..26ad03a5 100644 --- a/docs/data-publishing-design.md +++ b/docs/data-publishing-design.md @@ -75,9 +75,9 @@ pairing does not: `policyengine.py` release. **These are operational aliases, not a scientific citation surface** — release bundles remain the thing papers cite. -4. **Simpler refresh mechanics.** `refresh_release_bundle(country, - ...)` becomes "fetch channel → read manifest → write the - certified release manifest" with no sha256 juggling. +4. **Simpler certification mechanics.** `certify_data_release(country, + data_producer=...)` becomes "fetch producer manifest → validate → write + the certified bundle data release" with no sha256 juggling. Notably absent from that list compared to earlier drafts: **no claim of org-independent build identity**, **no claim of @@ -145,9 +145,9 @@ different things to four different audiences. - The certification process (who signs off, what validations, what compatibility checks) — unchanged. -- `src/policyengine/data/release_manifests/{country}.json` remains - the shipped record of what a given `policyengine.py` release - guarantees. 
+- `src/policyengine/data/bundle/manifest.json` remains the source record of + what a given `policyengine.py` release guarantees and is packaged directly + into the wheel. - The staged `provisional → certified → retired` lifecycle — unchanged. - `*.trace.tro.jsonld` sidecars — unchanged (shorter to build diff --git a/docs/engineering/skills/data-certification.md b/docs/engineering/skills/data-certification.md index b2a88051..ff522638 100644 --- a/docs/engineering/skills/data-certification.md +++ b/docs/engineering/skills/data-certification.md @@ -12,9 +12,9 @@ and region dataset templates. Certification asserts that *this* `pyproject.toml`, serves that data release — and the assertion is only made good by the test suite passing on the exact pinned pair. -There is no intermediate bundle repo. The vendored country manifest at -`src/policyengine/data/release_manifests/{country}.json` is derived directly -from the data release manifest. +There is no intermediate bundle repo. The `data_releases.{country}` entry in +`src/policyengine/data/bundle/manifest.json` is derived directly from the data +release manifest. ## Certifying a release @@ -22,30 +22,29 @@ Open the work on a fresh branch from current `main` (use a clean worktree if the checkout is dirty). ```bash -python scripts/certify_data_release.py --country us \ - --manifest-uri "hf://dataset/policyengine/populace-us@/releases//release_manifest.json" +python scripts/bundle.py certify-data --country uk --data-producer populace \ + --manifest-uri "hf://dataset/policyengine/populace-uk-private@/releases//release_manifest.json" ``` The script fetches and validates the manifest (every artifact must carry a -revision pin; the certified dataset must be reachable), writes the vendored -country manifest, exact-pins the country model package and raises the core -floor in `pyproject.toml`, regenerates the TRACE TRO sidecar, and writes a -Towncrier changelog fragment. 
+revision pin; the certified dataset must be reachable), writes the canonical +bundle manifest, exact-pins the country model package in that same manifest, +regenerates derived bundle metadata, and writes a Towncrier changelog fragment. Private data (UK) requires `HUGGING_FACE_TOKEN` or `HF_TOKEN`. After running: -- `uv lock` if pins moved, then `uv sync --all-extras`, +- run `python scripts/bundle.py check`, - run the full test suite — snapshot drift from a model bump is refreshed with `PE_UPDATE_SNAPSHOTS=1 pytest tests/test_household_calculator_snapshot.py`, -- commit the manifest, TRO, `pyproject.toml`, `uv.lock`, and fragment - together. +- commit `src/policyengine/data/bundle/manifest.json`, `pyproject.toml`, the + Towncrier fragment, and any regenerated TRO sidecars together. A certification PR should normally change only: -- `src/policyengine/data/release_manifests/{country}.json` (+ `.trace.tro.jsonld`) -- `pyproject.toml` / `uv.lock` +- `src/policyengine/data/bundle/manifest.json` (+ `{country}.trace.tro.jsonld`) +- `pyproject.toml` - one Towncrier fragment under `changelog.d/` - test constants/snapshots that pin certified versions @@ -69,15 +68,9 @@ publisher-claim basis above. ## Legacy paths -Countries whose current data release predates release manifests (the UK -enhanced FRS) are refreshed with the legacy single-country tool until their -next release certifies through a manifest: - -```bash -python scripts/refresh_release_bundle.py --country uk --model-version 2.89.0 -``` - -Do not hand-edit vendored country manifests for normal updates. +Do not hand-edit bundle data releases for normal updates. Countries whose +current data release predates release manifests need a data-producer strategy +before they can be updated through this path. 
 The retired `policyengine-bundles` flow (candidates → generated bundle → archive import) is preserved read-only in that repo's history; bundles diff --git a/docs/release-bundles.md b/docs/release-bundles.md index 5d755bca..20ada90f 100644 --- a/docs/release-bundles.md +++ b/docs/release-bundles.md @@ -1,7 +1,7 @@ # Release Bundles > **Current process.** Certification now runs inside this repository: -> `scripts/certify_data_release.py` derives the vendored country manifest +> `scripts/bundle.py certify-data` derives the certified bundle data release > directly from a country's data release manifest (see the > [data certification](engineering/skills/data-certification.md) > engineering skill). The intermediate `policyengine-bundles` repository @@ -20,6 +20,50 @@ The key design decision is: This keeps country-specific data construction in the country data repos while still giving users a single top-level version to cite and pin. +## Installing a certified bundle + +Use pip for ordinary library installs: + +```bash +pip install policyengine +``` + +Use the bundle installer when you want the certified package scaffold and the +certified datasets for a cited `policyengine` version: + +```bash +uvx --from policyengine==4.19.1 policyengine bundle install 4.19.1 +``` + +When run from `uvx` or `pipx`, the command creates or reuses `.venv`. Inside an +existing virtualenv or conda environment, it installs into that active +environment. It installs the bundled Python packages with pip, downloads the +certified US and UK datasets into `./data`, and writes a +`./data/.policyengine-bundle-receipt.json` receipt that records the target +Python. +Existing dataset files with the same filename are moved to +`./data/.policyengine-bundle-backups//`. 
+ +Useful variants: + +```bash +uvx --from policyengine policyengine bundle install +uvx --from policyengine policyengine bundle install --country uk +uvx --from policyengine policyengine bundle install --no-datasets +uvx --from policyengine policyengine bundle install --yes +``` + +Check a local environment against a bundle: + +```bash +uvx --from policyengine policyengine bundle status --data-dir ./data +uvx --from policyengine policyengine bundle verify 4.19.1 --data-dir ./data +policyengine bundle manifest 4.19.1 +``` + +`status` and `verify` use the receipt's recorded target Python by default. Pass +`--venv` or `--python` only to inspect a different environment explicitly. + ## Why this boundary exists For countries like the UK, the data package is not model-independent. Dataset construction, imputations, and calibration steps call the country model directly. That means a published dataset artifact depends on: @@ -69,13 +113,13 @@ It does not define the final supported runtime bundle exposed to users. It does not rebuild microdata artifacts. -Certification runs in this repository: the vendored country release -manifest under `src/policyengine/data/release_manifests/` is derived -directly from the country's published data release manifest. The -entrypoint is: +Certification runs in this repository: +`src/policyengine/data/bundle/manifest.json` carries the certified +`data_releases.{country}` entry derived directly from the country's published +data release manifest. 
The entrypoint is: ```bash -python scripts/certify_data_release.py --country us \ +python scripts/bundle.py certify-data --country us \ --manifest-uri "hf://dataset/policyengine/populace-us@/releases//release_manifest.json" ``` @@ -263,7 +307,7 @@ policyengine trace-tro us --out us.trace.tro.jsonld ``` At release time, `scripts/generate_trace_tros.py` regenerates the bundled -`data/release_manifests/{country}.trace.tro.jsonld` files, and the +`data/bundle/{country}.trace.tro.jsonld` files, and the `Versioning` CI job commits them alongside the changelog so every published wheel ships with the matching TRO. @@ -279,7 +323,7 @@ write_results_with_trace_tro( reform_payload={"salt_cap": 0}, bundle_tro_url=( "https://raw.githubusercontent.com/PolicyEngine/policyengine.py/" - "v3.4.5/src/policyengine/data/release_manifests/us.trace.tro.jsonld" + "v3.4.5/src/policyengine/data/bundle/us.trace.tro.jsonld" ), ) ``` diff --git a/docs/run-records.md b/docs/run-records.md index ec9dd073..590bc406 100644 --- a/docs/run-records.md +++ b/docs/run-records.md @@ -32,7 +32,7 @@ record = simulation.write_run_record( "./record", bundle_tro_url=( "https://raw.githubusercontent.com/PolicyEngine/policyengine.py/" - "main/src/policyengine/data/release_manifests/us.trace.tro.jsonld" + "main/src/policyengine/data/bundle/us.trace.tro.jsonld" ), ) print(record.composition_fingerprint) # the citable id diff --git a/docs/trace-case-study.md b/docs/trace-case-study.md index d6a8ccaf..12bf2774 100644 --- a/docs/trace-case-study.md +++ b/docs/trace-case-study.md @@ -8,7 +8,7 @@ The implementation has moved past several "not yet live" markers in the April draft below. As of June 2026: - **Certified bundle TROs ship in every `policyengine` release.** - `data/release_manifests/{us,uk}.trace.tro.jsonld` bind the bundle + `data/bundle/{us,uk}.trace.tro.jsonld` bind the bundle manifest, the certified dataset sha256, the country model wheel, and the data release manifest. 
Certification now reads country data release manifests directly from their Hugging Face repos diff --git a/pyproject.toml b/pyproject.toml index c5268a83..18732deb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,12 +43,17 @@ plotting = [ graph = [ "networkx>=3.0", ] +models = [ + "policyengine-core==3.27.1", + "policyengine-us==1.729.0", + "policyengine-uk==2.89.2", +] uk = [ - "policyengine_core>=3.27.1", + "policyengine-core==3.27.1", "policyengine-uk==2.89.2", ] us = [ - "policyengine_core>=3.27.1", + "policyengine-core==3.27.1", "policyengine-us==1.729.0", ] dev = [ @@ -62,12 +67,12 @@ dev = [ "plotly>=5.0.0", "pytest-asyncio>=0.26.0", "ruff>=0.9.0", - "policyengine_core>=3.27.1", - "policyengine-uk==2.89.2", - "policyengine-us==1.729.0", "towncrier>=24.8.0", "mypy>=1.11.0", "pytest-cov>=5.0.0", + "policyengine-core==3.27.1", + "policyengine-us==1.729.0", + "policyengine-uk==2.89.2", ] [tool.setuptools] diff --git a/scripts/bundle.py b/scripts/bundle.py new file mode 100644 index 00000000..2eb0c860 --- /dev/null +++ b/scripts/bundle.py @@ -0,0 +1,222 @@ +"""PolicyEngine bundle maintenance entry point. + +This script is the operator-facing wrapper around the lower-level bundle +maintenance scripts. It keeps the main workflows discoverable for humans and +AI agents while preserving the smaller implementation modules underneath. + +Examples: + + python scripts/bundle.py update-packages --us 1.730.0 --uk 2.91.0 + python scripts/bundle.py certify-data --country uk --manifest-uri hf://... 
+ python scripts/bundle.py generate + python scripts/bundle.py generate --include-tros + python scripts/bundle.py check +""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(REPO_ROOT / "scripts")) +sys.path.insert(0, str(REPO_ROOT / "src")) + + +def _certify_data(args: argparse.Namespace) -> int: + from certify_data_release import main as certify_data_release_main + + argv = [ + "--country", + args.country, + "--manifest-uri", + args.manifest_uri, + ] + if args.data_producer: + argv.extend(["--data-producer", args.data_producer]) + if args.model_version: + argv.extend(["--model-version", args.model_version]) + if args.no_generate: + argv.append("--no-generate") + if args.no_changelog: + argv.append("--no-changelog") + if args.skip_artifact_check: + argv.append("--skip-artifact-check") + return certify_data_release_main(argv) + + +def _update_packages(args: argparse.Namespace) -> int: + from prepare_package_bundle_update import main as prepare_package_bundle_update_main + + argv: list[str] = [] + for option in ("core", "us", "uk", "us_data"): + value = getattr(args, option) + if value: + argv.extend([f"--{option.replace('_', '-')}", value]) + if args.changelog: + argv.extend(["--changelog", args.changelog]) + if args.fragment_name: + argv.extend(["--fragment-name", args.fragment_name]) + return prepare_package_bundle_update_main(argv) + + +def _generate(args: argparse.Namespace) -> int: + from generate_bundle_artifacts import generate + + result = generate(check=False) + if not args.include_tros: + return result + return _generate_tros() + + +def _check(args: argparse.Namespace) -> int: + from generate_bundle_artifacts import generate + + result = generate(check=True) + if not args.include_tros: + return result + return result or _check_tros() + + +def _generate_tros() -> int: + 
os.environ.setdefault("POLICYENGINE_SKIP_COUNTRY_IMPORTS", "1") + from generate_trace_tros import regenerate_all + + written, regressions = regenerate_all() + for path in written: + print(f"wrote {path}") + for country_id, tro_path, reason in regressions: + print( + f"error: {country_id} already has {tro_path.name} but regeneration " + f"failed: {reason}", + file=sys.stderr, + ) + if regressions: + return 1 + if not written: + print("no countries could be regenerated (all skipped)", file=sys.stderr) + return 0 + + +def _check_tros() -> int: + os.environ.setdefault("POLICYENGINE_SKIP_COUNTRY_IMPORTS", "1") + from generate_trace_tros import generated_tros + + changed = False + for path, payload in generated_tros(): + if path.exists() and path.read_bytes() == payload: + continue + print( + f"{path.relative_to(REPO_ROOT)} is not up to date.", + file=sys.stderr, + ) + changed = True + return 1 if changed else 0 + + +def _parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Maintain PolicyEngine bundle metadata and derived artifacts." + ) + subparsers = parser.add_subparsers(dest="command", required=True) + + certify = subparsers.add_parser( + "certify-data", + help="Certify a country data release into the bundle manifest.", + ) + certify.add_argument("--country", required=True, choices=["us", "uk"]) + certify.add_argument( + "--data-producer", + choices=["legacy", "populace"], + help="Data-producer strategy. Defaults to the certification script default.", + ) + certify.add_argument( + "--manifest-uri", + required=True, + help="hf://dataset/@/", + ) + certify.add_argument( + "--model-version", + help="Model package version to certify for. 
Defaults to installed metadata.", + ) + certify.add_argument( + "--no-generate", + action="store_true", + help="Do not regenerate pyproject.toml and derived bundle metadata.", + ) + certify.add_argument( + "--no-changelog", + action="store_true", + help="Do not write a Towncrier changelog fragment.", + ) + certify.add_argument( + "--skip-artifact-check", + action="store_true", + help="Skip the certified dataset reachability check.", + ) + certify.set_defaults(func=_certify_data) + + packages = subparsers.add_parser( + "update-packages", + help="Update package pins in the bundle manifest.", + ) + packages.add_argument("--core", help="Exact version for policyengine-core.") + packages.add_argument("--us", help="Exact version for policyengine-us.") + packages.add_argument("--uk", help="Exact version for policyengine-uk.") + packages.add_argument( + "--us-data", + dest="us_data", + help="Exact version for policyengine-us-data.", + ) + packages.add_argument( + "--changelog", + default="Update the certified PolicyEngine bundle pins.", + help="Patch changelog text to include with the bundle update.", + ) + packages.add_argument( + "--fragment-name", + default="bundle-update.fixed.md", + help="Changelog fragment filename under changelog.d/.", + ) + packages.set_defaults(func=_update_packages) + + generate = subparsers.add_parser( + "generate", + help="Regenerate derived bundle artifacts.", + ) + generate.add_argument( + "--include-tros", + action="store_true", + help=( + "Also regenerate TRACE TRO sidecars. Private data releases require " + "HUGGING_FACE_TOKEN or HF_TOKEN." + ), + ) + generate.set_defaults(func=_generate) + + check = subparsers.add_parser( + "check", + help="Check derived bundle metadata.", + ) + check.add_argument( + "--include-tros", + action="store_true", + help=( + "Also check TRACE TRO sidecars. Private data releases require " + "HUGGING_FACE_TOKEN or HF_TOKEN." 
+ ), + ) + check.set_defaults(func=_check) + + return parser + + +def main(argv: list[str] | None = None) -> int: + args = _parser().parse_args(argv) + return args.func(args) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/certify_data_release.py b/scripts/certify_data_release.py index b22c5ab0..5da26f83 100644 --- a/scripts/certify_data_release.py +++ b/scripts/certify_data_release.py @@ -1,20 +1,20 @@ -"""Certify a country data release from its HF release manifest. +"""Certify a country data release into the PolicyEngine bundle manifest. -Replaces the policyengine-bundles import flow: fetches the data release -manifest, validates it, writes the vendored country manifest, exact-pins -the country model package in ``pyproject.toml``, regenerates the TRACE -TRO sidecar, and writes a Towncrier changelog fragment. +Fetches the data release manifest, validates it with the selected +data-producer strategy, writes the certified data release into the canonical +bundle manifest, regenerates derived bundle metadata, and writes a Towncrier +changelog fragment. Usage:: - python scripts/certify_data_release.py --country us \\ - --manifest-uri "hf://dataset/policyengine/populace-us@/releases//release_manifest.json" + python scripts/bundle.py certify-data --country uk --data-producer populace \\ + --manifest-uri "hf://dataset/policyengine/populace-uk-private@/releases//release_manifest.json" Private data (UK) requires ``HUGGING_FACE_TOKEN`` or ``HF_TOKEN`` in the -environment. After running: commit the changed manifest / TRO / -pyproject.toml / changelog fragment, re-lock if pins moved, and run the -test suite — certification is only asserted once the suite passes on the -exact pinned pair. +environment. After running: commit the changed bundle manifest, pyproject.toml, +changelog fragment, and any regenerated TRO sidecars, re-lock if pins moved, +and run the test suite. 
Certification is only asserted once the suite passes on +the exact pinned pair. """ from __future__ import annotations @@ -26,18 +26,24 @@ REPO_ROOT = Path(__file__).resolve().parents[1] sys.path.insert(0, str(REPO_ROOT / "src")) +sys.path.insert(0, str(REPO_ROOT / "scripts")) + +from generate_bundle_artifacts import generate # noqa: E402 from policyengine.provenance.certification import ( # noqa: E402 certify_data_release, ) -from policyengine.provenance.pyproject_pins import ( # noqa: E402 - update_country_pins, -) def main(argv=None) -> int: parser = argparse.ArgumentParser(description="Certify a country data release.") parser.add_argument("--country", required=True, choices=["us", "uk"]) + parser.add_argument( + "--data-producer", + default=None, + choices=["legacy", "populace"], + help="Data-producer strategy. Defaults to populace for UK, legacy otherwise.", + ) parser.add_argument( "--manifest-uri", required=True, @@ -48,8 +54,7 @@ def main(argv=None) -> int: default=None, help="Model package version to certify for (default: installed).", ) - parser.add_argument("--no-pyproject", action="store_true") - parser.add_argument("--no-tro", action="store_true") + parser.add_argument("--no-generate", action="store_true") parser.add_argument("--no-changelog", action="store_true") parser.add_argument( "--skip-artifact-check", @@ -62,32 +67,22 @@ def main(argv=None) -> int: result = certify_data_release( country=args.country, + data_producer=args.data_producer, manifest_uri=args.manifest_uri, model_version=args.model_version, token=token, + bundle_path=REPO_ROOT + / "src" + / "policyengine" + / "data" + / "bundle" + / "manifest.json", check_artifacts=not args.skip_artifact_check, ) print(result.summary()) - if not args.no_pyproject: - from importlib.metadata import version as installed_version - - update_country_pins( - pyproject_path=REPO_ROOT / "pyproject.toml", - country=args.country, - model_package=result.model_package, - model_version=result.model_version, - 
core_version=installed_version("policyengine_core"), - ) - print(f"pinned {result.model_package}=={result.model_version}") - - if not args.no_tro: - from policyengine.provenance.bundle import regenerate_trace_tro - - tro_path = regenerate_trace_tro( - args.country, result.country_manifest_path.parent - ) - print(f"trace tro: {tro_path}") + if not args.no_generate: + generate(check=False) if not args.no_changelog: changelog_dir = REPO_ROOT / "changelog.d" @@ -97,10 +92,10 @@ def main(argv=None) -> int: / f"certify-{args.country}-{result.build_id or 'data'}.changed.md" ) fragment.write_text( - f"Certify the {args.country.upper()} data release " - f"`{result.build_id}` ({result.default_dataset}, " - f"{result.model_package} {result.model_version}) directly from " - "its data release manifest.\n" + f"Certify the {args.country.upper()} {result.data_producer} " + f"data release `{result.build_id}` ({result.default_dataset}, " + f"{result.model_package} {result.model_version}) into the " + "PolicyEngine bundle manifest.\n" ) print(f"changelog: {fragment}") @@ -108,4 +103,4 @@ def main(argv=None) -> int: if __name__ == "__main__": - raise SystemExit(main()) + sys.exit(main()) diff --git a/scripts/export_bundle_release_assets.py b/scripts/export_bundle_release_assets.py new file mode 100644 index 00000000..c0ff3a08 --- /dev/null +++ b/scripts/export_bundle_release_assets.py @@ -0,0 +1,77 @@ +"""Export bundle metadata files for the GitHub release.""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +from generate_bundle_artifacts import BUNDLE_MANIFEST, REPO_ROOT + + +def _write_json(dist_dir: Path, name: str, payload: object) -> Path: + path = dist_dir / name + path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") + return path + + +def _write_text(dist_dir: Path, name: str, text: str) -> Path: + path = dist_dir / name + path.write_text(text) + return path + + +def main() -> int: + parser = 
argparse.ArgumentParser() + parser.add_argument("--dist-dir", type=Path, default=REPO_ROOT / "dist") + args = parser.parse_args() + + bundle = json.loads(BUNDLE_MANIFEST.read_text()) + version = bundle["bundle_version"] + args.dist_dir.mkdir(parents=True, exist_ok=True) + + manifest_path = _write_json( + args.dist_dir, + f"policyengine-bundle-{version}.json", + bundle, + ) + + constraint_packages = [ + name + for name, component in bundle["packages"].items() + if component.get("installable") is not False + ] + constraints = [ + bundle["packages"][package]["install_requirement"] + for package in constraint_packages + ] + constraints_path = _write_text( + args.dist_dir, + f"policyengine-bundle-{version}.constraints.txt", + "\n".join(constraints) + "\n", + ) + + citation_path = _write_text( + args.dist_dir, + f"policyengine-bundle-{version}.citation.txt", + "\n".join( + [ + f"PolicyEngine bundle {version}", + f"PolicyEngine package version: {bundle['policyengine_version']}", + "Components:", + *( + f"- {component['name']} {component['version']}" + for _, component in sorted(bundle["packages"].items()) + ), + ] + ) + + "\n", + ) + + for path in (manifest_path, constraints_path, citation_path): + print(path) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/generate_bundle_artifacts.py b/scripts/generate_bundle_artifacts.py new file mode 100644 index 00000000..d99b993d --- /dev/null +++ b/scripts/generate_bundle_artifacts.py @@ -0,0 +1,187 @@ +"""Generate pip extras from the canonical PolicyEngine bundle manifest.""" + +from __future__ import annotations + +import argparse +import copy +import json +import re +import sys +from pathlib import Path +from typing import Any, Mapping + +try: + import tomllib +except ModuleNotFoundError: # pragma: no cover - for local Python 3.10 users. 
+ import tomli as tomllib # type: ignore[no-redef] + +REPO_ROOT = Path(__file__).resolve().parents[1] +PYPROJECT = REPO_ROOT / "pyproject.toml" +BUNDLE_MANIFEST = ( + REPO_ROOT / "src" / "policyengine" / "data" / "bundle" / "manifest.json" +) + +OPTIONAL_DEPENDENCIES_HEADER = "[project.optional-dependencies]" +NEXT_SECTION_PATTERN = re.compile(r"\n\[tool\.setuptools\]", re.MULTILINE) +RETIRED_BUNDLE_EXTRAS = { + "data", + "full", + "uk-data", + "uk-full", + "us-data", + "us-full", +} + + +def load_bundle_manifest(path: Path = BUNDLE_MANIFEST) -> dict[str, Any]: + return json.loads(path.read_text()) + + +def write_bundle_manifest( + bundle: Mapping[str, Any], path: Path = BUNDLE_MANIFEST +) -> None: + path.write_text(json.dumps(bundle, indent=2, sort_keys=True) + "\n") + + +def normalized_manifest(bundle: Mapping[str, Any]) -> dict[str, Any]: + """Return the canonical on-disk manifest with derived convenience fields. + + ``manifest.json`` is the single editable/runtime manifest. We still + normalize redundant convenience fields such as package install requirements + so release assets and runtime commands can consume one stable shape. 
+ """ + + packages = { + key: { + **value, + "install_requirement": exact_requirement(value), + } + for key, value in bundle["packages"].items() + } + manifest = copy.deepcopy(dict(bundle)) + manifest["packages"] = packages + manifest.pop("source", None) + manifest["citation"] = { + "title": f"PolicyEngine bundle {bundle['bundle_version']}", + "version": bundle["bundle_version"], + "type": "software-bundle", + "publisher": "PolicyEngine", + } + return manifest + + +def manifest_text(bundle: Mapping[str, Any]) -> str: + return json.dumps(normalized_manifest(bundle), indent=2, sort_keys=True) + "\n" + + +def update_pyproject_text(pyproject_text: str, bundle: Mapping[str, Any]) -> str: + pyproject = tomllib.loads(pyproject_text) + optional = pyproject["project"].get("optional-dependencies", {}) + bundle_extras = bundle["extras"] + + kept_extras: dict[str, list[str]] = {} + for name, dependencies in optional.items(): + if name == "dev" or name in bundle_extras or name in RETIRED_BUNDLE_EXTRAS: + continue + kept_extras[name] = list(dependencies) + + generated_extras = { + name: [ + exact_requirement(bundle["packages"][package_name]) + for package_name in package_names + ] + for name, package_names in bundle_extras.items() + } + + first_party_package_names = { + normalized_requirement_name(component["name"]) + for component in bundle["packages"].values() + } + dev_dependencies = [ + dependency + for dependency in optional.get("dev", []) + if normalized_requirement_name(dependency) not in first_party_package_names + ] + for package_name in bundle_extras.get("models", []): + dev_dependencies.append(exact_requirement(bundle["packages"][package_name])) + + replacement = format_optional_dependencies( + { + **kept_extras, + **generated_extras, + "dev": dev_dependencies, + } + ) + start = pyproject_text.index(OPTIONAL_DEPENDENCIES_HEADER) + next_section = NEXT_SECTION_PATTERN.search(pyproject_text, start) + if next_section is None: + raise ValueError("Could not find section 
after project.optional-dependencies.") + return ( + pyproject_text[:start] + + replacement + + pyproject_text[next_section.start() + 1 :] + ) + + +def exact_requirement(component: Mapping[str, Any]) -> str: + requirement = f"{component['name']}=={component['version']}" + markers = component.get("markers") + if markers: + requirement += f"; {markers}" + return requirement + + +def normalized_requirement_name(dependency: str) -> str: + match = re.match(r"\s*([A-Za-z0-9_.-]+)", dependency) + if match is None: + return "" + return match.group(1).replace("_", "-").lower() + + +def format_optional_dependencies(extras: Mapping[str, list[str]]) -> str: + lines = [OPTIONAL_DEPENDENCIES_HEADER] + for extra_name, dependencies in extras.items(): + lines.append(f"{extra_name} = [") + for dependency in dependencies: + lines.append(f' "{dependency}",') + lines.append("]") + return "\n".join(lines) + "\n\n" + + +def write_or_check(path: Path, content: str, *, check: bool) -> bool: + if path.exists() and path.read_text() == content: + return False + if check: + print(f"{path.relative_to(REPO_ROOT)} is not up to date.", file=sys.stderr) + return True + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content) + print(f"Updated {path.relative_to(REPO_ROOT)}") + return True + + +def generate(*, check: bool = False) -> int: + bundle = load_bundle_manifest(BUNDLE_MANIFEST) + changed = False + changed |= write_or_check(BUNDLE_MANIFEST, manifest_text(bundle), check=check) + changed |= write_or_check( + PYPROJECT, + update_pyproject_text(PYPROJECT.read_text(), bundle), + check=check, + ) + return 1 if check and changed else 0 + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument( + "--check", + action="store_true", + help="Fail if generated files are not up to date.", + ) + args = parser.parse_args() + return generate(check=args.check) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/generate_trace_tros.py 
b/scripts/generate_trace_tros.py index 57013643..0b252be7 100644 --- a/scripts/generate_trace_tros.py +++ b/scripts/generate_trace_tros.py @@ -1,15 +1,16 @@ -"""Regenerate bundled TRACE TRO artifacts for every country release manifest. +"""Regenerate bundled TRACE TRO artifacts for every certified bundle country. -Writes ``data/release_manifests/{country}.trace.tro.jsonld`` for each -country whose bundled manifest ships in the wheel. Run this before +Writes ``data/bundle/{country}.trace.tro.jsonld`` for each country whose +certified data release ships in the bundle manifest. Run this before releasing a new ``policyengine.py`` version so the packaged TRO matches the pinned bundle. The richer data release manifest is included when available; otherwise the TRO still binds the certified dataset -sha256 and URI pinned in the bundled release manifest. +sha256 and URI pinned in the bundle manifest. """ from __future__ import annotations +import json import sys from pathlib import Path @@ -23,21 +24,28 @@ serialize_trace_tro, ) -MANIFEST_DIR = ( - Path(__file__).resolve().parent.parent - / "src" - / "policyengine" - / "data" - / "release_manifests" +REPO_ROOT = Path(__file__).resolve().parent.parent +BUNDLE_MANIFEST = ( + REPO_ROOT / "src" / "policyengine" / "data" / "bundle" / "manifest.json" ) +BUNDLE_TRO_DIR = REPO_ROOT / "src" / "policyengine" / "data" / "bundle" def regenerate_all() -> tuple[list[Path], list[tuple[str, Path, str]]]: written: list[Path] = [] regressions: list[tuple[str, Path, str]] = [] - for manifest_path in sorted(MANIFEST_DIR.glob("*.json")): - country_id = manifest_path.stem - tro_path = manifest_path.with_suffix(".trace.tro.jsonld") + for tro_path, payload in generated_tros(): + tro_path.write_bytes(payload) + written.append(tro_path) + return written, regressions + + +def generated_tros() -> list[tuple[Path, bytes]]: + payloads: list[tuple[Path, bytes]] = [] + bundle = json.loads(BUNDLE_MANIFEST.read_text()) + BUNDLE_TRO_DIR.mkdir(parents=True, 
exist_ok=True) + for country_id in sorted(bundle.get("data_releases", {})): + tro_path = BUNDLE_TRO_DIR / f"{country_id}.trace.tro.jsonld" country_manifest = get_release_manifest(country_id) try: data_release_manifest = get_data_release_manifest(country_id) @@ -51,15 +59,16 @@ def regenerate_all() -> tuple[list[Path], list[tuple[str, Path, str]]]: country_manifest, data_release_manifest, certification=country_manifest.certification, + model_wheel_sha256=country_manifest.model_package.sha256, + model_wheel_url=country_manifest.model_package.wheel_url, ) - tro_path.write_bytes(serialize_trace_tro(tro)) - written.append(tro_path) - return written, regressions + payloads.append((tro_path, serialize_trace_tro(tro))) + return payloads def main() -> int: - if not MANIFEST_DIR.is_dir(): - print(f"no manifest dir at {MANIFEST_DIR}", file=sys.stderr) + if not BUNDLE_MANIFEST.is_file(): + print(f"no bundle manifest at {BUNDLE_MANIFEST}", file=sys.stderr) return 1 written, regressions = regenerate_all() for path in written: diff --git a/scripts/prepare_package_bundle_update.py b/scripts/prepare_package_bundle_update.py new file mode 100644 index 00000000..a4f6ef27 --- /dev/null +++ b/scripts/prepare_package_bundle_update.py @@ -0,0 +1,74 @@ +"""Prepare a PR that only updates package pins in the PolicyEngine bundle.""" + +from __future__ import annotations + +import argparse +from typing import Any, Mapping + +from generate_bundle_artifacts import ( + BUNDLE_MANIFEST, + REPO_ROOT, + generate, + load_bundle_manifest, + write_bundle_manifest, +) + +PACKAGE_ARGS = { + "core": "policyengine-core", + "us": "policyengine-us", + "uk": "policyengine-uk", + "us_data": "policyengine-us-data", +} + + +def write_changelog(message: str, fragment_name: str) -> None: + changelog_dir = REPO_ROOT / "changelog.d" + changelog_dir.mkdir(exist_ok=True) + path = changelog_dir / fragment_name + path.write_text(message.strip() + "\n") + print(f"Updated {path.relative_to(REPO_ROOT)}") + + +def 
update_package_pins(bundle: Mapping[str, Any], args: argparse.Namespace) -> dict: + updated = dict(bundle) + packages = {key: dict(value) for key, value in bundle["packages"].items()} + updated["packages"] = packages + for arg_name, package_key in PACKAGE_ARGS.items(): + version = getattr(args, arg_name) + if version: + packages[package_key]["version"] = version + return updated + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser( + description="Prepare a package-pin-only PolicyEngine bundle PR." + ) + for arg_name, package_key in PACKAGE_ARGS.items(): + parser.add_argument( + f"--{arg_name.replace('_', '-')}", + dest=arg_name, + help=f"Exact version for {package_key}.", + ) + parser.add_argument( + "--changelog", + default="Update the certified PolicyEngine bundle pins.", + help="Patch changelog text to include with the bundle update.", + ) + parser.add_argument( + "--fragment-name", + default="bundle-update.fixed.md", + help="Changelog fragment filename under changelog.d/.", + ) + args = parser.parse_args(argv) + + bundle = update_package_pins(load_bundle_manifest(), args) + write_bundle_manifest(bundle) + print(f"Updated {BUNDLE_MANIFEST.relative_to(REPO_ROOT)}") + generate(check=False) + write_changelog(args.changelog, args.fragment_name) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/refresh_release_bundle.py b/scripts/refresh_release_bundle.py deleted file mode 100644 index 2bfac281..00000000 --- a/scripts/refresh_release_bundle.py +++ /dev/null @@ -1,98 +0,0 @@ -"""CLI wrapper around :func:`policyengine.provenance.refresh_release_bundle`. 
- -Usage:: - - python scripts/refresh_release_bundle.py --country us \\ - --model-version 1.653.3 --data-version 1.83.4 - -Fetches PyPI wheel metadata and streams the HF dataset to compute its -sha256, then writes updated ``data/release_manifests/{country}.json``, -bumps the matching pin in ``pyproject.toml`` (unless -``--no-pyproject``), and regenerates the bundle's TRACE TRO sidecar -(unless ``--no-tro``). - -Private HF datasets require ``HUGGING_FACE_TOKEN`` in the env. - -After running: - -- commit the changed manifest / TRO / pyproject.toml, -- manually rerun - ``PE_UPDATE_SNAPSHOTS=1 pytest tests/test_household_calculator_snapshot.py`` - to rebaseline expected household outputs — those numbers will - almost certainly drift when the data version bumps, and the drift - deserves human review before being committed. -""" - -from __future__ import annotations - -import argparse -import os -import sys - -os.environ.setdefault("POLICYENGINE_SKIP_COUNTRY_IMPORTS", "1") - -from policyengine.provenance.bundle import ( - refresh_release_bundle, - regenerate_trace_tro, -) - - -def main(argv: list[str] | None = None) -> int: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--country", required=True, choices=("us", "uk")) - parser.add_argument( - "--model-version", - help="New policyengine-{country} version (e.g. 1.653.3)", - ) - parser.add_argument( - "--data-version", - help="New policyengine-{country}-data version (e.g. 1.83.4)", - ) - parser.add_argument( - "--release-manifest-path", - help=( - "Override the data release manifest path, e.g. " - "releases/crfb-longrun-20260518/release_manifest.json" - ), - ) - parser.add_argument( - "--release-manifest-revision", - help=( - "HF revision to fetch the data release manifest from before " - "pinning the immutable repo commit." 
- ), - ) - parser.add_argument( - "--no-pyproject", - action="store_true", - help="Do not bump the country extra in pyproject.toml", - ) - parser.add_argument( - "--no-tro", - action="store_true", - help="Skip TRACE TRO regeneration", - ) - args = parser.parse_args(argv) - - if args.model_version is None and args.data_version is None: - parser.error("Pass at least --model-version or --data-version") - - result = refresh_release_bundle( - country=args.country, - model_version=args.model_version, - data_version=args.data_version, - release_manifest_path=args.release_manifest_path, - release_manifest_revision=args.release_manifest_revision, - update_pyproject=not args.no_pyproject, - ) - print(result.summary()) - - if not args.no_tro: - tro_path = regenerate_trace_tro(args.country) - print(f" TRO regenerated: {tro_path}") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/policyengine/bundle.py b/src/policyengine/bundle.py new file mode 100644 index 00000000..eeb45376 --- /dev/null +++ b/src/policyengine/bundle.py @@ -0,0 +1,920 @@ +"""PolicyEngine bundle installation and verification helpers. + +The bundle manifest is packaged with ``policyengine`` and names the exact +first-party packages plus certified data artifacts for a PolicyEngine release. +This module keeps installation pip-based while adding the dataset handling that +plain pip cannot provide. 
+""" + +from __future__ import annotations + +import hashlib +import json +import os +import shutil +import subprocess +import tempfile +import venv as venv_module +from dataclasses import dataclass +from datetime import datetime, timezone +from importlib import metadata +from importlib.resources import files +from pathlib import Path +from typing import Any, Iterable, Mapping, Optional, Sequence +from urllib.parse import quote + +import requests + +BUNDLE_MANIFEST_RESOURCE = ("data", "bundle", "manifest.json") +BUNDLE_HISTORY_RESOURCE = ("data", "bundles") +DEFAULT_COUNTRIES = ("us", "uk") +DEFAULT_DATA_DIR = Path("./data") +DEFAULT_VENV = Path(".venv") +RECEIPT_FILENAME = ".policyengine-bundle-receipt.json" +BACKUP_DIR_NAME = ".policyengine-bundle-backups" +DOWNLOAD_TIMEOUT_SECONDS = 60 + + +class BundleError(ValueError): + """Raised when bundle metadata or local installation state is invalid.""" + + +@dataclass(frozen=True) +class DatasetPlan: + country: str + dataset: str + uri: str + filename: str + data_version: Optional[str] + release_manifest_uri: Optional[str] + data_producer: str + repo_type: str + destination: Path + expected_sha256: Optional[str] + build_id: Optional[str] + + +class DataProducerRuntimeStrategy: + """Runtime install/verification behavior for a certified data producer.""" + + data_producer = "legacy" + + def dataset_plan( + self, + *, + country: str, + release: Mapping[str, Any], + data_dir: Path, + ) -> Optional[DatasetPlan]: + uri = release.get("default_dataset_uri") + dataset = release.get("default_dataset") + if not uri or not dataset: + return None + data_package = release.get("data_package", {}) + repo_type = ( + data_package.get("repo_type", "model") + if isinstance(data_package, Mapping) + else "model" + ) + dataset_name = str(dataset) + filename = _filename_from_uri(str(uri)) + return DatasetPlan( + country=country, + dataset=dataset_name, + uri=str(uri), + filename=filename, + data_version=( + str(release["version"]) if 
release.get("version") is not None else None + ), + release_manifest_uri=( + str(release["release_manifest_uri"]) + if release.get("release_manifest_uri") + else None + ), + data_producer=str(release.get("data_producer") or self.data_producer), + repo_type=str(repo_type), + destination=data_dir / filename, + expected_sha256=self.expected_sha256(release, dataset_name), + build_id=self.build_id(release), + ) + + def expected_sha256( + self, + release: Mapping[str, Any], + dataset: str, + ) -> Optional[str]: + dataset_artifact = self.default_dataset_artifact(release, dataset) + if dataset_artifact is not None and dataset_artifact.get("sha256"): + return str(dataset_artifact["sha256"]) + certified_artifact = release.get("certified_data_artifact") + if isinstance(certified_artifact, Mapping) and certified_artifact.get("sha256"): + return str(certified_artifact["sha256"]) + return None + + def default_dataset_artifact( + self, + release: Mapping[str, Any], + dataset: str, + ) -> Optional[Mapping[str, Any]]: + datasets = release.get("datasets") + if isinstance(datasets, Mapping): + artifact = datasets.get(dataset) + if isinstance(artifact, Mapping): + return artifact + return None + + def build_id(self, release: Mapping[str, Any]) -> Optional[str]: + build_id = release.get("build_id") + if build_id: + return str(build_id) + certified_artifact = release.get("certified_data_artifact") + if isinstance(certified_artifact, Mapping) and certified_artifact.get( + "build_id" + ): + return str(certified_artifact["build_id"]) + version = release.get("version") + return str(version) if version is not None else None + + def verify_download(self, plan: DatasetPlan, path: Path) -> str: + actual_sha256 = _sha256_file(path) + if plan.expected_sha256 and actual_sha256 != plan.expected_sha256: + raise BundleError( + f"Downloaded {plan.country.upper()} dataset {plan.dataset} " + f"has sha256 {actual_sha256}, expected {plan.expected_sha256}." 
+ ) + return actual_sha256 + + def dataset_check( + self, + plan: DatasetPlan, + receipt_dataset: Optional[Mapping[str, Any]], + ) -> dict[str, Any]: + check: dict[str, Any] = { + "country": plan.country, + "dataset": plan.dataset, + "expected_version": plan.data_version, + "expected_path": str(plan.destination), + } + if plan.expected_sha256: + check["expected_sha256"] = plan.expected_sha256 + if receipt_dataset is None: + check["status"] = "missing_receipt" + return check + if receipt_dataset.get("version") != plan.data_version: + check["status"] = "mismatch" + check["installed_version"] = receipt_dataset.get("version") + return check + path = Path(str(receipt_dataset.get("path", plan.destination))) + if not path.exists(): + check["status"] = "missing_file" + return check + check["installed_version"] = receipt_dataset.get("version") + check["path"] = str(path) + if plan.expected_sha256: + actual_sha256 = _sha256_file(path) + check["installed_sha256"] = actual_sha256 + if actual_sha256 != plan.expected_sha256: + check["status"] = "sha256_mismatch" + return check + check["status"] = "ok" + return check + + +class LegacyDataProducerRuntimeStrategy(DataProducerRuntimeStrategy): + data_producer = "legacy" + + +class PopulaceDataProducerRuntimeStrategy(DataProducerRuntimeStrategy): + data_producer = "populace" + + def expected_sha256( + self, + release: Mapping[str, Any], + dataset: str, + ) -> str: + expected = super().expected_sha256(release, dataset) + if not expected: + raise BundleError( + f"Populace data release for dataset {dataset!r} is missing " + "a certified sha256." 
+ ) + return expected + + +def _bundle_resource_path(): + path = files("policyengine") + for part in BUNDLE_MANIFEST_RESOURCE: + path = path.joinpath(part) + return path + + +def _bundle_history_path(version: str): + path = files("policyengine") + for part in BUNDLE_HISTORY_RESOURCE: + path = path.joinpath(part) + return path.joinpath(f"{version}.json") + + +def _normalise_manifest(manifest: Mapping[str, Any]) -> dict[str, Any]: + payload = dict(manifest) + bundle_version = payload.get("bundle_version") or payload.get( + "policyengine_version" + ) + if not bundle_version: + raise BundleError("Bundle manifest is missing a bundle version.") + payload["bundle_version"] = str(bundle_version) + payload.setdefault("policyengine_version", str(bundle_version)) + payload.setdefault("countries", {}) + payload.setdefault("packages", {}) + payload.setdefault("extras", {}) + payload.setdefault("data_releases", _data_releases_from_countries(payload)) + return payload + + +def _data_releases_from_countries(manifest: Mapping[str, Any]) -> dict[str, Any]: + releases: dict[str, Any] = {} + packages = manifest.get("packages", {}) + for country, country_meta in manifest.get("countries", {}).items(): + if not isinstance(country_meta, Mapping): + continue + data_package_name = country_meta.get("data_package") + data_package = ( + packages.get(data_package_name, {}) + if isinstance(data_package_name, str) and isinstance(packages, Mapping) + else {} + ) + data_version = ( + country_meta.get("data_artifact_version") + or country_meta.get("data_version") + or data_package.get("version") + ) + releases[str(country)] = { + "data_producer": country_meta.get("data_producer", "legacy"), + "data_package": data_package_name, + "version": data_version, + "default_dataset": country_meta.get("default_dataset"), + "default_dataset_uri": country_meta.get("default_dataset_uri"), + "release_manifest_uri": country_meta.get("release_manifest_uri"), + } + return releases + + +def get_current_bundle() -> 
dict[str, Any]: + """Return the bundle manifest packaged with this ``policyengine`` wheel.""" + + resource = _bundle_resource_path() + try: + return _normalise_manifest(json.loads(resource.read_text())) + except FileNotFoundError as exc: + raise BundleError("No packaged PolicyEngine bundle manifest found.") from exc + + +def load_bundle_manifest( + version: Optional[str] = None, + *, + manifest_ref: Optional[str] = None, +) -> dict[str, Any]: + """Load a packaged, historical, or custom bundle manifest.""" + + if manifest_ref: + if manifest_ref.startswith(("http://", "https://")): + response = requests.get(manifest_ref, timeout=DOWNLOAD_TIMEOUT_SECONDS) + response.raise_for_status() + return _normalise_manifest(response.json()) + return _normalise_manifest(json.loads(Path(manifest_ref).read_text())) + + current = get_current_bundle() + if version in (None, "latest", current["bundle_version"]): + return current + + history_path = _bundle_history_path(str(version)) + if history_path.is_file(): + return _normalise_manifest(json.loads(history_path.read_text())) + raise BundleError( + f"Bundle {version!r} is not packaged with this policyengine release." 
+ ) + + +def normalise_countries( + countries: Optional[Sequence[str]], + manifest: Optional[Mapping[str, Any]] = None, +) -> list[str]: + manifest = manifest or get_current_bundle() + available = set(manifest.get("countries", {}) or DEFAULT_COUNTRIES) + selected = list(countries or sorted(available)) + normalised = [] + for country in selected: + country_id = country.lower() + if country_id not in available: + raise BundleError(f"Unsupported bundle country: {country}") + if country_id not in normalised: + normalised.append(country_id) + return normalised + + +def bundle_install_requirements( + manifest: Optional[Mapping[str, Any]] = None, + *, + countries: Optional[Sequence[str]] = None, +) -> list[str]: + """Return exact pip requirements for the selected bundle package scaffold.""" + + bundle = _normalise_manifest(manifest or get_current_bundle()) + selected = set(normalise_countries(countries, bundle)) + requirements: list[str] = [] + for key, component in bundle.get("packages", {}).items(): + if not _include_component(str(key), component, selected): + continue + requirements.append( + component.get("install_requirement") or _requirement(component) + ) + return requirements + + +def _include_component( + key: str, + component: Mapping[str, Any], + countries: set[str], +) -> bool: + if component.get("installable") is False: + return False + role = component.get("role") + country = component.get("country") + if role in {"bundle_carrier", "runtime_dependency"}: + return True + if isinstance(country, str): + return country in countries + return key == "policyengine" + + +def _requirement(component: Mapping[str, Any]) -> str: + requirement = f"{component['name']}=={component['version']}" + markers = component.get("markers") + if markers: + requirement += f"; {markers}" + return requirement + + +def runtime_strategy(data_producer: Optional[str]) -> DataProducerRuntimeStrategy: + producer = data_producer or "legacy" + if producer == "populace": + return 
PopulaceDataProducerRuntimeStrategy() + return LegacyDataProducerRuntimeStrategy() + + +def resolve_target_python( + *, + python: Optional[str] = None, + venv: Optional[Path] = None, + create_venv: bool = True, +) -> Path: + """Resolve the Python interpreter that package installation should target.""" + + if python and venv: + raise BundleError("Pass either --python or --venv, not both.") + if venv is not None: + return _resolve_venv_python(venv, create_venv=create_venv) + if python: + candidate = Path(shutil.which(python) or python).expanduser().resolve() + if not candidate.exists(): + raise BundleError(f"Python interpreter not found: {python}") + return candidate + + active_env = os.environ.get("VIRTUAL_ENV") or os.environ.get("CONDA_PREFIX") + if active_env: + candidate = Path(active_env) / ( + "Scripts/python.exe" if os.name == "nt" else "bin/python" + ) + if _looks_like_runner_env(candidate): + return _resolve_venv_python(DEFAULT_VENV, create_venv=create_venv) + if candidate.exists(): + return candidate + return _resolve_venv_python(DEFAULT_VENV, create_venv=create_venv) + + +def _resolve_venv_python(path: Path, *, create_venv: bool) -> Path: + path = path.expanduser() + if not path.is_absolute(): + path = Path.cwd() / path + if create_venv: + return _ensure_venv(path) + return _venv_python(path) + + +def _ensure_venv(path: Path) -> Path: + if not path.exists(): + venv_module.EnvBuilder(with_pip=True).create(str(path)) + python = _venv_python(path) + if not python.exists(): + raise BundleError(f"Virtualenv at {path} does not contain Python.") + return python + + +def _venv_python(path: Path) -> Path: + return path / ("Scripts/python.exe" if os.name == "nt" else "bin/python") + + +def _looks_like_runner_env(python: Path) -> bool: + text = str(python).lower() + return "uvx" in text or "pipx" in text or "/uv/tools/" in text + + +def install_package_scaffold( + target_python: Path, + requirements: Sequence[str], + *, + dry_run: bool = False, +) -> None: + command 
= [str(target_python), "-m", "pip", "install", *requirements] + if dry_run: + print(" ".join(command)) + return + subprocess.run(command, check=True) + + +def dataset_plans( + manifest: Optional[Mapping[str, Any]] = None, + *, + countries: Optional[Sequence[str]] = None, + data_dir: Path = DEFAULT_DATA_DIR, +) -> list[DatasetPlan]: + bundle = _normalise_manifest(manifest or get_current_bundle()) + releases = bundle.get("data_releases") or _data_releases_from_countries(bundle) + plans: list[DatasetPlan] = [] + for country in normalise_countries(countries, bundle): + release = releases.get(country, {}) if isinstance(releases, Mapping) else {} + strategy = runtime_strategy(str(release.get("data_producer") or "legacy")) + plan = strategy.dataset_plan( + country=country, release=release, data_dir=data_dir + ) + if plan is None: + continue + plans.append(plan) + return plans + + +def _filename_from_uri(uri: str) -> str: + without_revision = uri.rsplit("@", 1)[0] + if without_revision.startswith("hf://"): + return ( + without_revision.removeprefix("hf://").split("/", 2)[2].rsplit("/", 1)[-1] + ) + if without_revision.startswith("gs://"): + return ( + without_revision.removeprefix("gs://").split("/", 1)[1].rsplit("/", 1)[-1] + ) + return Path(without_revision).name + + +def install_datasets( + manifest: Mapping[str, Any], + *, + countries: Optional[Sequence[str]] = None, + data_dir: Path = DEFAULT_DATA_DIR, + yes: bool = False, + dry_run: bool = False, + session=requests, +) -> list[dict[str, Any]]: + plans = dataset_plans(manifest, countries=countries, data_dir=data_dir) + if not plans: + return [] + _confirm_dataset_install(plans, data_dir=data_dir, yes=yes, dry_run=dry_run) + installed = [] + for plan in plans: + if dry_run: + print(f"download {plan.uri} -> {plan.destination}") + installed.append(_receipt_dataset(plan)) + continue + downloaded = _download_to_temp(plan, data_dir=data_dir, session=session) + installed_sha256 = None + try: + installed_sha256 = 
def _confirm_dataset_install(
    plans: Sequence[DatasetPlan],
    *,
    data_dir: Path,
    yes: bool,
    dry_run: bool,
) -> None:
    """Announce the planned dataset downloads and ask the user to confirm.

    Args:
        plans: Dataset plans about to be installed; their countries are listed.
        data_dir: Directory the datasets will be written into.
        yes: Skip the interactive prompt.
        dry_run: Skip the interactive prompt (nothing will be changed).

    Raises:
        BundleError: If the user declines, or no answer can be read from
            standard input.
    """
    countries = ", ".join(plan.country for plan in plans)
    print(
        "This will download certified PolicyEngine datasets for "
        f"{countries} into {data_dir}."
    )
    # Backups are stored in a per-run timestamped sub-directory; the
    # placeholder makes the printed location match that layout.
    print(
        "Existing matching dataset files will be moved to "
        f"{data_dir / BACKUP_DIR_NAME}/<timestamp>/."
    )
    if yes or dry_run:
        return
    try:
        answer = input("Continue? [y/N] ").strip().lower()
    except EOFError:
        # Non-interactive stdin (CI, pipes): fail cleanly instead of crashing.
        raise BundleError(
            "Dataset installation cancelled: no confirmation could be read. "
            "Pass --yes to confirm non-interactively."
        ) from None
    if answer not in {"y", "yes"}:
        raise BundleError("Dataset installation cancelled.")
def _download_url(uri: str, *, repo_type: str = "model") -> str:
    """Translate a dataset URI into the HTTPS URL it should be fetched from.

    * ``hf://<owner>/<repo>/<path>@<revision>`` becomes a Hugging Face
      ``resolve`` URL (under ``datasets/`` when ``repo_type == "dataset"``).
    * ``gs://<bucket>/<path>`` becomes a ``storage.googleapis.com`` URL; any
      ``@revision`` suffix is dropped.
    * Anything else (including http(s) URLs) is returned unchanged.

    Raises:
        BundleError: If an ``hf://`` URI is malformed or pins no revision.
    """
    without_revision, revision = _split_revision(uri)
    if without_revision.startswith("hf://"):
        parts = without_revision.removeprefix("hf://").split("/", 2)
        if len(parts) != 3:
            raise BundleError(f"Invalid Hugging Face dataset URI: {uri}")
        owner, repo, path = parts
        if not revision:
            raise BundleError(f"Hugging Face dataset URI must pin a revision: {uri}")
        prefix = "datasets/" if repo_type == "dataset" else ""
        # safe="" keeps a revision such as "refs/pr/1" as one URL segment;
        # the file path is quoted too so spaces and the like survive.
        return (
            f"https://huggingface.co/{prefix}{owner}/{repo}"
            f"/resolve/{quote(revision, safe='')}/{quote(path)}"
        )
    if without_revision.startswith("gs://"):
        bucket, _, path = without_revision.removeprefix("gs://").partition("/")
        return f"https://storage.googleapis.com/{bucket}/{quote(path)}"
    return uri


def _split_revision(uri: str) -> tuple[str, Optional[str]]:
    """Split ``<uri>@<revision>`` at the last ``@``.

    Returns ``(uri, None)`` when the URI contains no ``@``.
    """
    without_revision, separator, revision = uri.rpartition("@")
    if not separator:
        return uri, None
    return without_revision, revision


def _auth_headers(uri: str) -> dict[str, str]:
    """Return a bearer-token header for Hugging Face URIs, else no headers.

    The token is read from ``HUGGING_FACE_TOKEN``, ``HF_TOKEN`` or
    ``HUGGINGFACE_HUB_TOKEN`` (first non-empty wins). It is never attached
    to URIs for other hosts.
    """
    if not uri.startswith(("hf://", "https://huggingface.co/")):
        return {}
    token = (
        os.environ.get("HUGGING_FACE_TOKEN")
        or os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGINGFACE_HUB_TOKEN")
    )
    return {"Authorization": f"Bearer {token}"} if token else {}
def _receipt_dataset(
    plan: DatasetPlan,
    *,
    installed_sha256: Optional[str] = None,
) -> dict[str, Any]:
    """Build the receipt entry recorded for one dataset plan.

    ``build_id``, ``expected_sha256`` and ``installed_sha256`` are included
    only when they have a truthy value.
    """
    receipt = {
        "country": plan.country,
        "dataset": plan.dataset,
        "version": plan.data_version,
        "uri": plan.uri,
        "path": str(plan.destination),
        "release_manifest_uri": plan.release_manifest_uri,
        "data_producer": plan.data_producer,
        "repo_type": plan.repo_type,
    }
    optional_fields = {
        "build_id": plan.build_id,
        "expected_sha256": plan.expected_sha256,
        "installed_sha256": installed_sha256,
    }
    receipt.update({key: value for key, value in optional_fields.items() if value})
    return receipt


def write_receipt(
    manifest: Mapping[str, Any],
    *,
    data_dir: Path,
    countries: Sequence[str],
    datasets: Sequence[Mapping[str, Any]],
    target_python: Optional[Path] = None,
) -> Path:
    """Write the bundle installation receipt into ``data_dir``.

    Args:
        manifest: Bundle manifest; must contain ``bundle_version`` and
            ``policyengine_version``.
        data_dir: Directory to hold the receipt (created if needed).
        countries: Countries that were installed.
        datasets: Receipt entries for the installed datasets.
        target_python: Interpreter the packages were installed into, if any.

    Returns:
        Path of the written receipt file.
    """
    receipt = {
        "schema_version": 1,
        "installed_at": datetime.now(timezone.utc).isoformat(),
        "bundle_version": manifest["bundle_version"],
        "policyengine_version": manifest["policyengine_version"],
        "countries": list(countries),
        "packages": manifest.get("packages", {}),
        "datasets": list(datasets),
    }
    if target_python is not None:
        # Record an absolute path WITHOUT following symlinks: a virtualenv's
        # bin/python usually links to the base interpreter, and storing the
        # link target would make later status checks inspect the wrong
        # environment.
        receipt["target_python"] = os.path.abspath(target_python)
    data_dir.mkdir(parents=True, exist_ok=True)
    path = data_dir / RECEIPT_FILENAME
    path.write_text(json.dumps(receipt, indent=2, sort_keys=True) + "\n")
    return path


def read_receipt(data_dir: Path = DEFAULT_DATA_DIR) -> Optional[dict[str, Any]]:
    """Load the receipt from ``data_dir``.

    Returns ``None`` when the receipt is missing, unreadable, not valid
    JSON, or not a JSON object.
    """
    path = data_dir / RECEIPT_FILENAME
    if not path.exists():
        return None
    try:
        payload = json.loads(path.read_text())
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, so a
        # corrupt receipt is treated like a missing one.
        return None
    return payload if isinstance(payload, dict) else None


def install_bundle(
    version: Optional[str] = None,
    *,
    manifest_ref: Optional[str] = None,
    python: Optional[str] = None,
    venv: Optional[Path] = None,
    countries: Optional[Sequence[str]] = None,
    data_dir: Path = DEFAULT_DATA_DIR,
    no_datasets: bool = False,
    yes: bool = False,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Install a bundle's packages and, unless disabled, its datasets.

    Args:
        version: Bundle version to load.
        manifest_ref: Alternative manifest reference passed to the loader.
        python: Interpreter to install into (mutually exclusive with ``venv``).
        venv: Virtual environment to install into.
        countries: Countries to include; normalised against the manifest.
        data_dir: Directory for datasets and the receipt.
        no_datasets: Skip dataset installation.
        yes: Confirm dataset downloads without prompting.
        dry_run: Print actions only; no venv is created and no receipt written.

    Returns:
        Summary with the bundle version, requirements, countries, dataset
        receipt entries, data directory and target interpreter.
    """
    manifest = load_bundle_manifest(version, manifest_ref=manifest_ref)
    selected_countries = normalise_countries(countries, manifest)
    requirements = bundle_install_requirements(manifest, countries=selected_countries)
    target_python = resolve_target_python(
        python=python,
        venv=venv,
        create_venv=not dry_run,
    )
    install_package_scaffold(target_python, requirements, dry_run=dry_run)

    installed_datasets: list[dict[str, Any]] = []
    if not no_datasets:
        installed_datasets = install_datasets(
            manifest,
            countries=selected_countries,
            data_dir=data_dir,
            yes=yes,
            dry_run=dry_run,
        )
    if not dry_run:
        write_receipt(
            manifest,
            data_dir=data_dir,
            countries=selected_countries,
            datasets=installed_datasets,
            target_python=target_python,
        )
    return {
        "bundle_version": manifest["bundle_version"],
        "requirements": requirements,
        "countries": selected_countries,
        "datasets": installed_datasets,
        "data_dir": str(data_dir),
        "target_python": str(target_python),
    }
def _resolve_status_python(
    *,
    python: Optional[str],
    venv: Optional[Path],
    receipt: Optional[Mapping[str, Any]],
) -> Optional[Path]:
    """Pick the interpreter whose packages a status check should inspect.

    An explicit ``python``/``venv`` wins (never creating a venv); otherwise
    the receipt's ``target_python`` is used. Returns ``None`` when neither is
    available.
    """
    if python or venv:
        return resolve_target_python(python=python, venv=venv, create_venv=False)
    if isinstance(receipt, Mapping):
        target = receipt.get("target_python")
        if isinstance(target, str) and target:
            return Path(target)
    return None


def _selected_components(
    manifest: Mapping[str, Any], countries: Sequence[str]
) -> Iterable[Mapping[str, Any]]:
    """Yield the manifest ``packages`` entries selected for ``countries``."""
    selected = set(countries)
    for key, component in manifest.get("packages", {}).items():
        if _include_component(str(key), component, selected):
            yield component


def _package_checks(
    components: Sequence[Mapping[str, Any]],
    *,
    target_python: Optional[Path],
) -> list[dict[str, Any]]:
    """Compare each component's expected version with what is installed.

    With a ``target_python`` the versions are read from that interpreter; a
    missing or failing interpreter yields one error check per component.
    Without it, the current process's environment is inspected.
    """
    if target_python is None:
        return [_package_check(component) for component in components]
    if not target_python.exists():
        return [
            _package_target_error_check(
                component,
                target_python=target_python,
                status="target_python_missing",
                detail=f"Target Python does not exist: {target_python}",
            )
            for component in components
        ]
    versions, error = _package_versions_from_python(target_python, components)
    if error is not None:
        return [
            _package_target_error_check(
                component,
                target_python=target_python,
                status="target_python_error",
                detail=error,
            )
            for component in components
        ]
    return [
        _package_check(component, installed_versions=versions)
        for component in components
    ]


def _package_versions_from_python(
    target_python: Path,
    components: Sequence[Mapping[str, Any]],
) -> tuple[dict[str, Optional[str]], Optional[str]]:
    """Ask ``target_python`` which versions of the components it has installed.

    Returns:
        ``(versions, None)`` on success, where a package that is not
        installed maps to ``None``; ``({}, message)`` when the interpreter
        cannot be run, exits non-zero, or prints something unexpected.
    """
    package_names = sorted({str(component["name"]) for component in components})
    script = """
import importlib.metadata as metadata
import json
import sys

versions = {}
for package_name in json.loads(sys.argv[1]):
    try:
        versions[package_name] = metadata.version(package_name)
    except metadata.PackageNotFoundError:
        versions[package_name] = None
print(json.dumps(versions, sort_keys=True))
"""
    try:
        result = subprocess.run(
            [str(target_python), "-c", script, json.dumps(package_names)],
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError as exc:
        # A path that is not executable (or vanished) should surface as an
        # error check, not as an unhandled traceback.
        return {}, f"Could not run {target_python}: {exc}"
    if result.returncode != 0:
        detail = result.stderr.strip() or result.stdout.strip()
        return {}, detail or f"{target_python} exited with {result.returncode}"
    try:
        payload = json.loads(result.stdout)
    except json.JSONDecodeError:
        return {}, f"{target_python} returned invalid package metadata JSON"
    if not isinstance(payload, dict):
        return {}, f"{target_python} returned invalid package metadata"
    return {str(key): value for key, value in payload.items()}, None


def _package_target_error_check(
    component: Mapping[str, Any],
    *,
    target_python: Path,
    status: str,
    detail: str,
) -> dict[str, Any]:
    """Build the check reported when the target interpreter is unusable."""
    return {
        "package": str(component["name"]),
        "expected_version": str(component["version"]),
        "target_python": str(target_python),
        "status": status,
        "detail": detail,
    }
def _dataset_checks(
    manifest: Mapping[str, Any],
    countries: Sequence[str],
    data_dir: Path,
    receipt: Optional[Mapping[str, Any]],
) -> list[dict[str, Any]]:
    """Run the producer-specific dataset check for every planned dataset.

    Each plan is checked against the receipt entry recorded for its country
    (``None`` when the receipt has no such entry).
    """
    receipt_datasets = _receipt_datasets_by_country(receipt)
    return [
        runtime_strategy(plan.data_producer).dataset_check(
            plan, receipt_datasets.get(plan.country)
        )
        for plan in dataset_plans(manifest, countries=countries, data_dir=data_dir)
    ]


def _receipt_datasets_by_country(
    receipt: Optional[Mapping[str, Any]],
) -> dict[str, Mapping[str, Any]]:
    """Index a receipt's dataset entries by country.

    Malformed receipts (no mapping, ``datasets`` missing/null/not a list,
    entries without a country) contribute nothing instead of raising. When a
    country appears more than once the last entry wins.
    """
    if not isinstance(receipt, Mapping):
        return {}
    entries = receipt.get("datasets")
    if not isinstance(entries, (list, tuple)):
        return {}
    return {
        str(entry["country"]): entry
        for entry in entries
        if isinstance(entry, Mapping) and entry.get("country")
    }


def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of ``path``, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while chunk := stream.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
us, uk).") + bundle = subparsers.add_parser( + "bundle", + help="Install, inspect, or verify a PolicyEngine bundle.", + ) + bundle_subparsers = bundle.add_subparsers( + dest="bundle_command", + required=True, + ) + + bundle_install = bundle_subparsers.add_parser( + "install", + help="Install a bundle's package scaffold and certified datasets.", + ) + bundle_install.add_argument( + "version", + nargs="?", + help="Bundle version to install. Defaults to the newest packaged bundle.", + ) + bundle_install.add_argument( + "--manifest", + help="Custom bundle manifest path or URL.", + ) + bundle_install.add_argument( + "--python", + help=( + "Python interpreter to install the package scaffold into. Defaults " + "to the active environment, or ./.venv when run from uvx/pipx." + ), + ) + bundle_install.add_argument( + "--venv", + type=Path, + help=( + "Virtual environment to create or reuse as the installation target. " + "Defaults to ./.venv when run from uvx/pipx." + ), + ) + bundle_install.add_argument( + "--country", + action="append", + choices=("us", "uk"), + help="Country to include. Repeat for multiple countries. 
Defaults to all.", + ) + bundle_install.add_argument( + "--no-datasets", + action="store_true", + help="Install packages without downloading certified datasets.", + ) + bundle_install.add_argument( + "--data-dir", + type=Path, + default=Path("./data"), + help="Directory for certified dataset files and the bundle receipt.", + ) + bundle_install.add_argument( + "--yes", + action="store_true", + help="Confirm dataset downloads non-interactively.", + ) + bundle_install.add_argument( + "--dry-run", + action="store_true", + help="Print installation actions without changing packages or datasets.", + ) + + bundle_status = bundle_subparsers.add_parser( + "status", + help="Show local package and dataset status for a bundle.", + ) + bundle_status.add_argument("version", nargs="?", help="Bundle version to compare.") + bundle_status.add_argument("--manifest", help="Custom bundle manifest path or URL.") + bundle_status.add_argument( + "--python", + help=( + "Python interpreter to inspect. Defaults to the receipt target, " + "then the current process." + ), + ) + bundle_status.add_argument( + "--venv", + type=Path, + help=( + "Virtual environment to inspect. Defaults to the receipt target, " + "then the current process." + ), + ) + bundle_status.add_argument( + "--country", + action="append", + choices=("us", "uk"), + help="Country to inspect. Repeat for multiple countries. 
Defaults to all.", + ) + bundle_status.add_argument( + "--data-dir", + type=Path, + default=Path("./data"), + help="Directory containing the bundle receipt and datasets.", + ) + bundle_status.add_argument( + "--json", + action="store_true", + help="Print the full status report as JSON.", + ) + bundle_status.add_argument( + "--packages-only", + action="store_true", + help="Skip dataset receipt checks and verify only installed packages.", + ) + + bundle_verify = bundle_subparsers.add_parser( + "verify", + help="Verify local packages and datasets against a bundle.", + ) + bundle_verify.add_argument("version", nargs="?", help="Bundle version to verify.") + bundle_verify.add_argument("--manifest", help="Custom bundle manifest path or URL.") + bundle_verify.add_argument( + "--python", + help=( + "Python interpreter to inspect. Defaults to the receipt target, " + "then the current process." + ), + ) + bundle_verify.add_argument( + "--venv", + type=Path, + help=( + "Virtual environment to inspect. Defaults to the receipt target, " + "then the current process." + ), + ) + bundle_verify.add_argument( + "--country", + action="append", + choices=("us", "uk"), + help="Country to verify. Repeat for multiple countries. Defaults to all.", + ) + bundle_verify.add_argument( + "--data-dir", + type=Path, + default=Path("./data"), + help="Directory containing the bundle receipt and datasets.", + ) + bundle_verify.add_argument( + "--json", + action="store_true", + help="Print the full verification report as JSON.", + ) + bundle_verify.add_argument( + "--packages-only", + action="store_true", + help="Skip dataset receipt checks and verify only installed packages.", + ) + + bundle_manifest = bundle_subparsers.add_parser( + "manifest", + help="Print a bundle manifest as JSON.", + ) + bundle_manifest.add_argument("version", nargs="?", help="Bundle version to print.") + bundle_manifest.add_argument( + "--manifest", help="Custom bundle manifest path or URL." 
def _install_bundle(args: argparse.Namespace) -> int:
    """Handle ``bundle install``: run the install and print its summary as JSON.

    Returns 0 on success, 1 when a ``BundleError`` is reported on stderr.
    """
    try:
        result = install_bundle(
            args.version,
            manifest_ref=args.manifest,
            python=args.python,
            venv=args.venv,
            countries=args.country,
            data_dir=args.data_dir,
            no_datasets=args.no_datasets,
            yes=args.yes,
            dry_run=args.dry_run,
        )
    except BundleError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1
    print(json.dumps(result, indent=2, sort_keys=True))
    return 0


def _bundle_report(args: argparse.Namespace) -> Optional[dict]:
    """Run the inspection shared by ``bundle status`` and ``bundle verify``.

    Returns the report, or ``None`` after printing a ``BundleError`` to stderr.
    """
    try:
        return inspect_bundle_status(
            args.version,
            manifest_ref=args.manifest,
            python=args.python,
            venv=args.venv,
            countries=args.country,
            data_dir=args.data_dir,
            packages_only=args.packages_only,
        )
    except BundleError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return None


def _bundle_status(args: argparse.Namespace) -> int:
    """Handle ``bundle status``: print per-package and per-dataset status.

    Returns 0 when everything matched, 1 otherwise (including errors).
    """
    report = _bundle_report(args)
    if report is None:
        return 1
    if args.json:
        print(json.dumps(report, indent=2, sort_keys=True))
        return 0 if report["matched"] else 1

    status = "matched" if report["matched"] else "mismatch"
    print(f"PolicyEngine bundle {report['bundle_version']}: {status}")
    if report["target_python"]:
        print(f"target Python: {report['target_python']}")
    for check in report["packages"]:
        installed = check.get("installed_version", "missing")
        line = (
            f"- {check['package']}: {check['status']} "
            f"(expected {check['expected_version']}, installed {installed})"
        )
        # Interpreter failures carry their reason in "detail"; without it the
        # line would misleadingly read as a plain missing package.
        if check.get("detail"):
            line += f" - {check['detail']}"
        print(line)
    for check in report["datasets"]:
        installed = check.get("installed_version", "missing")
        print(
            f"- {check['country']} dataset {check['dataset']}: "
            f"{check['status']} (expected {check['expected_version']}, "
            f"installed {installed})"
        )
    return 0 if report["matched"] else 1


def _bundle_verify(args: argparse.Namespace) -> int:
    """Handle ``bundle verify``: print a one-line verdict (or the JSON report).

    Returns 0 when everything matched, 1 otherwise (including errors).
    """
    report = _bundle_report(args)
    if report is None:
        return 1
    if args.json:
        print(json.dumps(report, indent=2, sort_keys=True))
    else:
        print(
            f"PolicyEngine bundle {report['bundle_version']}: "
            f"{'ok' if report['matched'] else 'failed'}"
        )
    return 0 if report["matched"] else 1


def _emit_bundle_manifest(args: argparse.Namespace) -> int:
    """Handle ``bundle manifest``: print the selected manifest as JSON.

    Returns 0 on success, 1 when a ``BundleError`` is reported on stderr.
    """
    try:
        manifest = load_bundle_manifest(args.version, manifest_ref=args.manifest)
    except BundleError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1
    print(json.dumps(manifest, indent=2, sort_keys=True))
    return 0
"certified_by": "policyengine.py bundle certification", + "certified_for_model_version": "2.89.2", + "compatibility_basis": "built_with_model_package", + "data_build_id": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z" + }, + "certified_data_artifact": { + "build_id": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", + "data_package": { + "name": "populace-data", + "version": "0.1.0" + }, + "dataset": "populace_uk_2023", + "sha256": "f17306ccb2aad7ff0130be3589b560afb2e2a12a943570911cd0c77f07934833", + "uri": "hf://policyengine/populace-uk-private/populace_uk_2023.h5@populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z" + }, + "country_id": "uk", + "data_package": { + "name": "populace-data", + "release_manifest_path": "releases/populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z/release_manifest.json", + "release_manifest_revision": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", + "repo_id": "policyengine/populace-uk-private", + "repo_type": "dataset", + "version": "0.1.0" + }, + "data_producer": "populace", + "datasets": { + "calibration_diagnostics": { + "path": "releases/populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z/calibration_diagnostics.json", + "repo_id": "policyengine/populace-uk-private", + "revision": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", + "sha256": "80b98127020aafb049846e0877a3818476aaf7adf13539d62d512fdd6727745d" + }, + "enhanced_frs_2023_24": { + "path": "enhanced_frs_2023_24.h5", + "repo_id": "policyengine/policyengine-uk-data-private", + "revision": "655dd07e4bb9c777b00dac044949611f1feb824f", + "sha256": "584ae33d80ca0431254610a3f8254d132da73477d31966d6446282861ecae50d" + }, + "frs_2023_24": { + "path": "frs_2023_24.h5", + "repo_id": "policyengine/policyengine-uk-data-private", + "revision": "655dd07e4bb9c777b00dac044949611f1feb824f", + "sha256": "df26d4d7af9d164aa2d064181b39290292d2f62bb26fee6126fc095fc06da292" + }, + "populace_uk_2023": { + "path": "populace_uk_2023.h5", + "repo_id": "policyengine/populace-uk-private", + 
"revision": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", + "sha256": "f17306ccb2aad7ff0130be3589b560afb2e2a12a943570911cd0c77f07934833" + }, + "populace_uk_2023_calibration": { + "path": "populace_uk_2023_calibration.npz", + "repo_id": "policyengine/populace-uk-private", + "revision": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", + "sha256": "fb2fc115fbae53a501b8acbc1529f319b9e07b74478c7bd02d00c674d4c10022" + } + }, + "default_dataset": "populace_uk_2023", + "default_dataset_uri": "hf://policyengine/populace-uk-private/populace_uk_2023.h5@populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", + "model_package": { + "name": "policyengine-uk", + "sha256": "80965d3dd7dc767db9b083820d40262ce543020d5a8880a0cf88da10ae641b24", + "version": "2.89.2", + "wheel_url": "https://files.pythonhosted.org/packages/83/db/ce3154ba69b6fcd1e9e922ceee705ef4ddb1f81553da1e63b9296e74a4dc/policyengine_uk-2.89.2-py3-none-any.whl" + }, + "policyengine_version": "4.17.11", + "region_datasets": { + "national": { + "path_template": "populace_uk_2023.h5" + } + }, + "release_manifest_uri": "https://huggingface.co/datasets/policyengine/populace-uk-private/resolve/populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z/releases/populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z/release_manifest.json", + "schema_version": 1, + "source_manifest_uri": "hf://dataset/policyengine/populace-uk-private@populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z/releases/populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z/release_manifest.json", + "version": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z" + }, + "us": { + "build_id": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "bundle_id": "us-4.17.11", + "certification": { + "built_with_model_version": "1.729.0", + "certified_by": "policyengine.py bundle certification", + "certified_for_model_version": "1.729.0", + "compatibility_basis": "built_with_model_package", + "data_build_id": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z" + }, 
+ "certified_data_artifact": { + "build_id": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "data_package": { + "name": "populace-data", + "version": "0.1.0" + }, + "dataset": "populace_us_2024", + "sha256": "16be6338f9d0b3c339883dae59949e995663b64cf145de6728b3dd0f916c5d5f", + "uri": "hf://policyengine/populace-us/populace_us_2024.h5@populace-us-2024-f0af251-703bd81a565c-20260620T201958Z" + }, + "country_id": "us", + "data_package": { + "name": "populace-data", + "release_manifest_path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/release_manifest.json", + "release_manifest_revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "repo_id": "policyengine/populace-us", + "repo_type": "dataset", + "version": "0.1.0" + }, + "data_producer": "populace", + "datasets": { + "calibration_diagnostics": { + "path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/calibration_diagnostics.json", + "repo_id": "policyengine/populace-us", + "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "sha256": "448a3c7ff0bda996332764d3c0a831f0e1df05a0eaaff8c2b3d65a40d9383c13" + }, + "demographics": { + "path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/demographics.json", + "repo_id": "policyengine/populace-us", + "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "sha256": "eea9b24a198ff654390cf8a14c2dc12eb5b9894d203d0fd9674747bbe3ab7815" + }, + "populace_us_2024": { + "path": "populace_us_2024.h5", + "repo_id": "policyengine/populace-us", + "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "sha256": "16be6338f9d0b3c339883dae59949e995663b64cf145de6728b3dd0f916c5d5f" + }, + "populace_us_2024_calibration": { + "path": "populace_us_2024_calibration.npz", + "repo_id": "policyengine/populace-us", + "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "sha256": "bf79d64a5e18a028ecd1236f393b72fea26b32411b2e41c0a0e68e48e556adbb" + 
}, + "reform_validation": { + "path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/reform_validation.json", + "repo_id": "policyengine/populace-us", + "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "sha256": "c577e0b1419f9c12de0cad7a4b3bf32507e718f5ea9d79bc3a32d0c48160cc89" + }, + "us_source_coverage": { + "path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/us_source_coverage.json", + "repo_id": "policyengine/populace-us", + "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "sha256": "c3e0c388be2f53ca03b59dddc110db28bc4b0acaa21e2eb86003c735e4f47d8e" + } + }, + "default_dataset": "populace_us_2024", + "default_dataset_uri": "hf://policyengine/populace-us/populace_us_2024.h5@populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", + "model_package": { + "name": "policyengine-us", + "sha256": "8d21d3f7c0e82a9415edffe8ea53939330a63d9c8f6bd334299bddb697cf2c00", + "version": "1.729.0", + "wheel_url": "https://files.pythonhosted.org/packages/b9/7d/778f92ae94997b00c3c9ac34b345f6c9333435f905670ee4eeb2f5e19809/policyengine_us-1.729.0-py3-none-any.whl" + }, + "policyengine_version": "4.17.11", + "region_datasets": {}, + "release_manifest_uri": "https://huggingface.co/datasets/policyengine/populace-us/resolve/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/release_manifest.json", + "schema_version": 1, + "source_manifest_uri": "hf://dataset/policyengine/populace-us@populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/release_manifest.json", + "version": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z" + } + }, + "extras": { + "models": [ + "policyengine-core", + "policyengine-us", + "policyengine-uk" + ], + "uk": [ + "policyengine-core", + "policyengine-uk" + ], + "us": [ + "policyengine-core", + "policyengine-us" + ] + }, + "packages": { + 
"policyengine": { + "import_name": "policyengine", + "install_requirement": "policyengine==4.17.11", + "name": "policyengine", + "role": "bundle_carrier", + "version": "4.17.11" + }, + "policyengine-core": { + "import_name": "policyengine_core", + "install_requirement": "policyengine-core==3.27.1", + "name": "policyengine-core", + "role": "runtime_dependency", + "version": "3.27.1" + }, + "policyengine-uk": { + "country": "uk", + "import_name": "policyengine_uk", + "install_requirement": "policyengine-uk==2.89.2", + "name": "policyengine-uk", + "role": "country_model", + "version": "2.89.2" + }, + "policyengine-us": { + "country": "us", + "import_name": "policyengine_us", + "install_requirement": "policyengine-us==1.729.0", + "name": "policyengine-us", + "role": "country_model", + "version": "1.729.0" + }, + "policyengine-us-data": { + "country": "us", + "import_name": "policyengine_us_data", + "install_requirement": "policyengine-us-data==1.78.2; python_version >= '3.12' and python_version < '3.15'", + "markers": "python_version >= '3.12' and python_version < '3.15'", + "name": "policyengine-us-data", + "optional": true, + "role": "country_data", + "version": "1.78.2" + } + }, + "policyengine_version": "4.17.11", + "schema_version": 2 +} diff --git a/src/policyengine/data/release_manifests/uk.trace.tro.jsonld b/src/policyengine/data/bundle/uk.trace.tro.jsonld similarity index 90% rename from src/policyengine/data/release_manifests/uk.trace.tro.jsonld rename to src/policyengine/data/bundle/uk.trace.tro.jsonld index 311132c4..7328dae3 100644 --- a/src/policyengine/data/release_manifests/uk.trace.tro.jsonld +++ b/src/policyengine/data/bundle/uk.trace.tro.jsonld @@ -37,7 +37,7 @@ "trov:hasArtifact": { "@id": "composition/1/artifact/bundle_manifest" }, - "trov:hasLocation": "data/release_manifests/uk.json" + "trov:hasLocation": "data/bundle/manifest.json" }, { "@id": "arrangement/1/location/data_release_manifest", @@ -75,7 +75,7 @@ "@type": "trov:ResearchArtifact", 
"schema:name": "policyengine.py bundle manifest for uk", "trov:mimeType": "application/json", - "trov:sha256": "eac8716c73d87fd1843589f0c768c9e0036d0af3cb35b6413ca373d906b5f0ba" + "trov:sha256": "dacd2f935270948e290bf0e8233c05463231f91ae5cbabee08f3f6b3b7337a3a" }, { "@id": "composition/1/artifact/data_release_manifest", @@ -102,21 +102,18 @@ "trov:hasFingerprint": { "@id": "composition/1/fingerprint", "@type": "trov:CompositionFingerprint", - "trov:sha256": "4a40448670ab27e857b38dd77eacde14655b313b644d964e4f33829d13f31857" + "trov:sha256": "27396616f0b473bd88ea80c543030ee0efde87e93bf9d8372e3f83f1c09ee209" } }, "trov:hasPerformance": { "@id": "trp/1", "@type": "trov:TransparentResearchPerformance", "pe:builtWithModelVersion": "2.89.2", - "pe:certifiedBy": "policyengine.py certification", + "pe:certifiedBy": "policyengine.py bundle certification", "pe:certifiedForModelVersion": "2.89.2", - "pe:ciGitRef": "refs/heads/main", - "pe:ciGitSha": "59509f48da0d5884b7b6c6dc9c6ab3321e8115ea", - "pe:ciRunUrl": "https://github.com/PolicyEngine/policyengine.py/actions/runs/27883888353", "pe:compatibilityBasis": "built_with_model_package", "pe:dataBuildId": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", - "pe:emittedIn": "github-actions", + "pe:emittedIn": "local", "rdfs:comment": "Certification of build populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z for policyengine-uk 2.89.2.", "trov:accessedArrangement": { "@id": "arrangement/1" diff --git a/src/policyengine/data/release_manifests/us.trace.tro.jsonld b/src/policyengine/data/bundle/us.trace.tro.jsonld similarity index 90% rename from src/policyengine/data/release_manifests/us.trace.tro.jsonld rename to src/policyengine/data/bundle/us.trace.tro.jsonld index 2b902728..24411ef0 100644 --- a/src/policyengine/data/release_manifests/us.trace.tro.jsonld +++ b/src/policyengine/data/bundle/us.trace.tro.jsonld @@ -37,7 +37,7 @@ "trov:hasArtifact": { "@id": "composition/1/artifact/bundle_manifest" }, - "trov:hasLocation": 
"data/release_manifests/us.json" + "trov:hasLocation": "data/bundle/manifest.json" }, { "@id": "arrangement/1/location/data_release_manifest", @@ -75,7 +75,7 @@ "@type": "trov:ResearchArtifact", "schema:name": "policyengine.py bundle manifest for us", "trov:mimeType": "application/json", - "trov:sha256": "d29cfd6a4b8e4af57f13963ce1884e3da832aba650674871db23fa7f3a1d82f0" + "trov:sha256": "dacd2f935270948e290bf0e8233c05463231f91ae5cbabee08f3f6b3b7337a3a" }, { "@id": "composition/1/artifact/data_release_manifest", @@ -102,21 +102,18 @@ "trov:hasFingerprint": { "@id": "composition/1/fingerprint", "@type": "trov:CompositionFingerprint", - "trov:sha256": "18fcf6abf4eab6d367335a11b33ca2b218171973c32a1cd6b70c5f4ca34b745a" + "trov:sha256": "dd8ab19358c13d3cc8a3292043644bbf71a567327949f901b4d9eac203ffe048" } }, "trov:hasPerformance": { "@id": "trp/1", "@type": "trov:TransparentResearchPerformance", "pe:builtWithModelVersion": "1.729.0", - "pe:certifiedBy": "policyengine.py certification", + "pe:certifiedBy": "policyengine.py bundle certification", "pe:certifiedForModelVersion": "1.729.0", - "pe:ciGitRef": "refs/heads/main", - "pe:ciGitSha": "59509f48da0d5884b7b6c6dc9c6ab3321e8115ea", - "pe:ciRunUrl": "https://github.com/PolicyEngine/policyengine.py/actions/runs/27883888353", "pe:compatibilityBasis": "built_with_model_package", "pe:dataBuildId": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "pe:emittedIn": "github-actions", + "pe:emittedIn": "local", "rdfs:comment": "Certification of build populace-us-2024-f0af251-703bd81a565c-20260620T201958Z for policyengine-us 1.729.0.", "trov:accessedArrangement": { "@id": "arrangement/1" diff --git a/src/policyengine/data/release_manifests/uk.json b/src/policyengine/data/release_manifests/uk.json deleted file mode 100644 index 9ca2b525..00000000 --- a/src/policyengine/data/release_manifests/uk.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "bundle_id": "uk-4.17.11", - "certification": { - "built_with_model_version": "2.89.2", - 
"certified_by": "policyengine.py certification", - "certified_for_model_version": "2.89.2", - "compatibility_basis": "built_with_model_package", - "data_build_id": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z" - }, - "certified_data_artifact": { - "build_id": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", - "data_package": { - "name": "populace-data", - "version": "0.1.0" - }, - "dataset": "populace_uk_2023", - "sha256": "f17306ccb2aad7ff0130be3589b560afb2e2a12a943570911cd0c77f07934833", - "uri": "hf://policyengine/populace-uk-private/populace_uk_2023.h5@populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z" - }, - "country_id": "uk", - "data_package": { - "name": "populace-data", - "release_manifest_path": "releases/populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z/release_manifest.json", - "release_manifest_revision": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", - "repo_id": "policyengine/populace-uk-private", - "repo_type": "dataset", - "version": "0.1.0" - }, - "datasets": { - "frs_2023_24": { - "path": "frs_2023_24.h5", - "repo_id": "policyengine/policyengine-uk-data-private", - "revision": "655dd07e4bb9c777b00dac044949611f1feb824f", - "sha256": "df26d4d7af9d164aa2d064181b39290292d2f62bb26fee6126fc095fc06da292" - }, - "enhanced_frs_2023_24": { - "path": "enhanced_frs_2023_24.h5", - "repo_id": "policyengine/policyengine-uk-data-private", - "revision": "655dd07e4bb9c777b00dac044949611f1feb824f", - "sha256": "584ae33d80ca0431254610a3f8254d132da73477d31966d6446282861ecae50d" - }, - "calibration_diagnostics": { - "path": "releases/populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z/calibration_diagnostics.json", - "repo_id": "policyengine/populace-uk-private", - "revision": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", - "sha256": "80b98127020aafb049846e0877a3818476aaf7adf13539d62d512fdd6727745d" - }, - "populace_uk_2023": { - "path": "populace_uk_2023.h5", - "repo_id": "policyengine/populace-uk-private", - "revision": 
"populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", - "sha256": "f17306ccb2aad7ff0130be3589b560afb2e2a12a943570911cd0c77f07934833" - }, - "populace_uk_2023_calibration": { - "path": "populace_uk_2023_calibration.npz", - "repo_id": "policyengine/populace-uk-private", - "revision": "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z", - "sha256": "fb2fc115fbae53a501b8acbc1529f319b9e07b74478c7bd02d00c674d4c10022" - } - }, - "default_dataset": "populace_uk_2023", - "model_package": { - "name": "policyengine-uk", - "sha256": "80965d3dd7dc767db9b083820d40262ce543020d5a8880a0cf88da10ae641b24", - "version": "2.89.2", - "wheel_url": "https://files.pythonhosted.org/packages/83/db/ce3154ba69b6fcd1e9e922ceee705ef4ddb1f81553da1e63b9296e74a4dc/policyengine_uk-2.89.2-py3-none-any.whl" - }, - "policyengine_version": "4.17.11", - "region_datasets": { - "national": { - "path_template": "populace_uk_2023.h5" - } - }, - "schema_version": 1 -} diff --git a/src/policyengine/data/release_manifests/us.json b/src/policyengine/data/release_manifests/us.json deleted file mode 100644 index f4fd5e3f..00000000 --- a/src/policyengine/data/release_manifests/us.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "bundle_id": "us-4.17.11", - "certification": { - "built_with_model_version": "1.729.0", - "certified_by": "policyengine.py certification", - "certified_for_model_version": "1.729.0", - "compatibility_basis": "built_with_model_package", - "data_build_id": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z" - }, - "certified_data_artifact": { - "build_id": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "data_package": { - "name": "populace-data", - "version": "0.1.0" - }, - "dataset": "populace_us_2024", - "sha256": "16be6338f9d0b3c339883dae59949e995663b64cf145de6728b3dd0f916c5d5f", - "uri": "hf://policyengine/populace-us/populace_us_2024.h5@populace-us-2024-f0af251-703bd81a565c-20260620T201958Z" - }, - "country_id": "us", - "data_package": { - "name": "populace-data", - 
"release_manifest_path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/release_manifest.json", - "release_manifest_revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "repo_id": "policyengine/populace-us", - "repo_type": "dataset", - "version": "0.1.0" - }, - "datasets": { - "calibration_diagnostics": { - "path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/calibration_diagnostics.json", - "repo_id": "policyengine/populace-us", - "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "sha256": "448a3c7ff0bda996332764d3c0a831f0e1df05a0eaaff8c2b3d65a40d9383c13" - }, - "demographics": { - "path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/demographics.json", - "repo_id": "policyengine/populace-us", - "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "sha256": "eea9b24a198ff654390cf8a14c2dc12eb5b9894d203d0fd9674747bbe3ab7815" - }, - "populace_us_2024": { - "path": "populace_us_2024.h5", - "repo_id": "policyengine/populace-us", - "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "sha256": "16be6338f9d0b3c339883dae59949e995663b64cf145de6728b3dd0f916c5d5f" - }, - "populace_us_2024_calibration": { - "path": "populace_us_2024_calibration.npz", - "repo_id": "policyengine/populace-us", - "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "sha256": "bf79d64a5e18a028ecd1236f393b72fea26b32411b2e41c0a0e68e48e556adbb" - }, - "reform_validation": { - "path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/reform_validation.json", - "repo_id": "policyengine/populace-us", - "revision": "populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "sha256": "c577e0b1419f9c12de0cad7a4b3bf32507e718f5ea9d79bc3a32d0c48160cc89" - }, - "us_source_coverage": { - "path": "releases/populace-us-2024-f0af251-703bd81a565c-20260620T201958Z/us_source_coverage.json", - "repo_id": "policyengine/populace-us", - "revision": 
"populace-us-2024-f0af251-703bd81a565c-20260620T201958Z", - "sha256": "c3e0c388be2f53ca03b59dddc110db28bc4b0acaa21e2eb86003c735e4f47d8e" - } - }, - "default_dataset": "populace_us_2024", - "model_package": { - "name": "policyengine-us", - "sha256": "8d21d3f7c0e82a9415edffe8ea53939330a63d9c8f6bd334299bddb697cf2c00", - "version": "1.729.0", - "wheel_url": "https://files.pythonhosted.org/packages/b9/7d/778f92ae94997b00c3c9ac34b345f6c9333435f905670ee4eeb2f5e19809/policyengine_us-1.729.0-py3-none-any.whl" - }, - "policyengine_version": "4.17.11", - "region_datasets": {}, - "schema_version": 1 -} diff --git a/src/policyengine/data/schemas/trace_tro.schema.json b/src/policyengine/data/schemas/trace_tro.schema.json index 244f2d01..f7b902ca 100644 --- a/src/policyengine/data/schemas/trace_tro.schema.json +++ b/src/policyengine/data/schemas/trace_tro.schema.json @@ -72,7 +72,7 @@ }, "trov:hasLocation": { "type": "string", - "pattern": "^(https://[^\\s]+$|data/release_manifests/[a-z]{2,3}\\.json$|(bundle\\.trace\\.tro\\.jsonld(#[a-f0-9]{64})?)|[a-z_]+\\.json$)" + "pattern": "^(https://[^\\s]+$|data/bundle/(manifest\\.json|[a-z]{2,3}\\.trace\\.tro\\.jsonld)$|(bundle\\.trace\\.tro\\.jsonld(#[a-f0-9]{64})?)|[a-z_]+\\.json$)" } } }, diff --git a/src/policyengine/provenance/__init__.py b/src/policyengine/provenance/__init__.py index b3ad336f..5ef0789d 100644 --- a/src/policyengine/provenance/__init__.py +++ b/src/policyengine/provenance/__init__.py @@ -15,12 +15,6 @@ ) """ -from .bundle import RefreshResult as RefreshResult -from .bundle import refresh_release_bundle as refresh_release_bundle -from .bundle import regenerate_trace_tro as regenerate_trace_tro -from .bundle import ( - sync_release_manifest_policyengine_version as sync_release_manifest_policyengine_version, -) from .certification import ( CertificationError as CertificationError, ) diff --git a/src/policyengine/provenance/bundle.py b/src/policyengine/provenance/bundle.py deleted file mode 100644 index 
aeee68c4..00000000 --- a/src/policyengine/provenance/bundle.py +++ /dev/null @@ -1,971 +0,0 @@ -"""Refresh a country release manifest in place. - -The release manifest at ``data/release_manifests/{country}.json`` pins -three artifacts by content hash: - -- the country model wheel (sha256 + PyPI download URL), -- the certified microdata artifact (sha256 + HF resolve URL), -- the data package metadata used to compute the build fingerprint. - -When a country bumps its PyPI wheel or HF dataset, every one of those -pins has to move together, and the TRACE TRO sidecar at -``data/release_manifests/{country}.trace.tro.jsonld`` must be -regenerated so replication reviewers see the right hashes. - -This module exposes the refresh as a library function: - -.. code-block:: python - - from policyengine.provenance.bundle import refresh_release_bundle - - result = refresh_release_bundle( - country="us", - model_version="1.653.3", - data_version="1.83.4", - ) - print(result.summary()) - -``scripts/refresh_release_bundle.py`` is a thin argparse wrapper for -operational use. Network access is required (PyPI JSON API + HF HEAD -against the dataset URI). Private country data (UK) additionally -needs ``HUGGING_FACE_TOKEN``. -""" - -from __future__ import annotations - -import hashlib -import json -import os -import posixpath -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Optional -from urllib.request import Request, urlopen - -from packaging.specifiers import InvalidSpecifier, SpecifierSet -from packaging.version import InvalidVersion, Version - -from policyengine.provenance.manifest import ( - CountryReleaseManifest, - get_release_manifest, - https_dataset_uri, -) - -# --------------------------------------------------------------------------- -# Paths inside the installed / source-tree wheel. 
-# --------------------------------------------------------------------------- - -REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent -MANIFEST_DIR = REPO_ROOT / "src" / "policyengine" / "data" / "release_manifests" -PYPROJECT = REPO_ROOT / "pyproject.toml" -SEMVER_PATTERN = re.compile(r"^(\d+)\.(\d+)\.(\d+)$") - - -@dataclass(frozen=True) -class _CriticalCalibrationTarget: - name: str - max_abs_relative_error: float - - -_US_POPULACE_CRITICAL_CALIBRATION_TARGETS = ( - _CriticalCalibrationTarget( - name="irs_soi.ty2022.historic_table_2.us.all.income_tax_liability_amount@2024", - max_abs_relative_error=0.05, - ), - _CriticalCalibrationTarget( - name="irs_soi.ty2022.historic_table_2.us.all.income_tax_liability_returns@2024", - max_abs_relative_error=0.10, - ), - _CriticalCalibrationTarget( - name="ssa_supplement.cy2024.oasdi_ssi_payments.social_security_benefits.payment_amount@2024", - max_abs_relative_error=0.05, - ), -) - - -# --------------------------------------------------------------------------- -# policyengine.py bundle identity -# --------------------------------------------------------------------------- - - -def _pyproject_version(pyproject_path: Path) -> str: - text = pyproject_path.read_text() - match = re.search(r'^version\s*=\s*"(\d+\.\d+\.\d+)"', text, re.MULTILINE) - if match is None: - raise ValueError(f"Could not find project version in {pyproject_path}") - return match.group(1) - - -def sync_release_manifest_policyengine_version( - *, - policyengine_version: Optional[str] = None, - manifest_dir: Path = MANIFEST_DIR, - pyproject_path: Path = PYPROJECT, -) -> list[Path]: - """Sync bundled release manifests to the current ``policyengine.py`` version. - - Country model/data refreshes and package release bumps move through - different automation paths. This helper keeps the top-level bundle identity - tied to the package release regardless of which path writes the manifest. 
- """ - resolved_version = policyengine_version or _pyproject_version(pyproject_path) - if not SEMVER_PATTERN.match(resolved_version): - raise ValueError(f"Invalid policyengine version: {resolved_version}") - - updated_paths: list[Path] = [] - for manifest_path in sorted(manifest_dir.glob("*.json")): - manifest_json = json.loads(manifest_path.read_text()) - country_id = manifest_json.get("country_id") or manifest_path.stem - expected_bundle_id = f"{country_id}-{resolved_version}" - if ( - manifest_json.get("policyengine_version") == resolved_version - and manifest_json.get("bundle_id") == expected_bundle_id - ): - continue - - manifest_json["policyengine_version"] = resolved_version - manifest_json["bundle_id"] = expected_bundle_id - manifest_path.write_text( - json.dumps(manifest_json, indent=2, sort_keys=False) + "\n" - ) - updated_paths.append(manifest_path) - - return updated_paths - - -# --------------------------------------------------------------------------- -# PyPI metadata resolution -# --------------------------------------------------------------------------- - - -def _pypi_wheel_metadata(package: str, version: str) -> dict: - """Return ``{"url": ..., "sha256": ...}`` for the py3-none-any wheel - of ``package==version`` on PyPI. - - Raises if PyPI reports no matching wheel, or if multiple matching - wheels exist with different sha256s (i.e. the release is - unambiguous). 
- """ - url = f"https://pypi.org/pypi/{package}/{version}/json" - with urlopen(Request(url, headers={"User-Agent": "policyengine.py"})) as f: - payload = json.load(f) - wheels = [ - f - for f in payload.get("urls", []) - if f.get("packagetype") == "bdist_wheel" - and "py3-none-any" in f.get("filename", "") - ] - if not wheels: - raise ValueError( - f"No py3-none-any wheel found on PyPI for {package}=={version}" - ) - sha256s = {f["digests"]["sha256"] for f in wheels} - if len(sha256s) > 1: - raise ValueError( - f"Multiple distinct py3-none-any wheels for {package}=={version}: {sha256s}" - ) - wheel = wheels[0] - return {"url": wheel["url"], "sha256": wheel["digests"]["sha256"]} - - -# --------------------------------------------------------------------------- -# Hugging Face dataset resolution -# --------------------------------------------------------------------------- - - -def _hf_dataset_sha256(repo_id: str, path: str, revision: str) -> str: - """Fetch the dataset file's sha256 by streaming the resolve URL. - - Uses the ``HUGGING_FACE_TOKEN`` env var for private repos. Streams - the file in 8 MiB chunks so memory usage stays flat. 
- """ - url = https_dataset_uri( - repo_id=repo_id, - path_in_repo=path, - revision=revision, - ) - headers = {"User-Agent": "policyengine.py"} - token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") - if token: - headers["Authorization"] = f"Bearer {token}" - - hasher = hashlib.sha256() - with urlopen(Request(url, headers=headers)) as f: - while True: - chunk = f.read(8 * 1024 * 1024) - if not chunk: - break - hasher.update(chunk) - return hasher.hexdigest() - - -@dataclass(frozen=True) -class _DataReleaseManifestFetch: - payload: dict - repo_commit: Optional[str] - - -def _fetch_data_release_manifest( - repo_id: str, - release_manifest_path: str, - revision: str, - *, - repo_type: str = "model", - allow_main_fallback: bool = True, -) -> Optional[_DataReleaseManifestFetch]: - """Fetch a data release manifest from HF if one is available. - - Older data releases may not have a machine-readable release manifest at the - inferred path. In that case the bundle refresh falls back to hashing the - dataset artifact directly. - - Data releases are stored under versioned paths, but the HF repository does - not necessarily create a matching git tag for each data version. For - inferred data-version revisions, try the version revision first for - repositories that do publish tags, then fall back to ``main`` and persist - the immutable ``x-repo-commit`` header. Explicit revisions do not get that - fallback because a typo or stale CRFB run ref should fail closed. 
- """ - headers = {"User-Agent": "policyengine.py"} - token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") - if token: - headers["Authorization"] = f"Bearer {token}" - - revisions = [revision] - if allow_main_fallback and revision != "main": - revisions.append("main") - - for candidate in revisions: - prefix = "datasets/" if repo_type == "dataset" else "" - url = ( - f"https://huggingface.co/{prefix}{repo_id}/resolve/" - f"{candidate}/{release_manifest_path}" - ) - try: - with urlopen(Request(url, headers=headers)) as f: - payload = json.load(f) - repo_commit = getattr(f, "headers", {}).get("x-repo-commit") - return _DataReleaseManifestFetch( - payload=payload, - repo_commit=repo_commit, - ) - except (OSError, ValueError): - continue - return None - - -def _fetch_json_release_artifact( - artifact: dict, - *, - release_manifest_path: str | None, - data_repo_id: str, - data_repo_type: str, - default_revision: str, -) -> dict: - path = _release_scoped_artifact_path( - artifact, - release_manifest_path=release_manifest_path, - data_repo_id=data_repo_id, - ) - repo_id = artifact.get("repo_id") or data_repo_id - revision = artifact.get("revision") or default_revision - repo_type = artifact.get("repo_type") - if repo_type is None: - repo_type = data_repo_type if repo_id == data_repo_id else "model" - url = https_dataset_uri( - repo_id=repo_id, - path_in_repo=path, - revision=revision, - repo_type=repo_type, - ) - headers = {"User-Agent": "policyengine.py"} - token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") - if token: - headers["Authorization"] = f"Bearer {token}" - - with urlopen(Request(url, headers=headers)) as f: - content = f.read() - - expected_sha256 = artifact.get("sha256") - if expected_sha256: - actual_sha256 = hashlib.sha256(content).hexdigest() - if actual_sha256 != expected_sha256: - raise ValueError( - "Data release artifact hash mismatch for " - f"{path!r}: expected {expected_sha256}, got {actual_sha256}." 
- ) - - try: - return json.loads(content) - except ValueError as exc: - raise ValueError(f"Data release artifact {path!r} is not valid JSON.") from exc - - -def _calibration_relative_error(target: dict) -> float | None: - relative_error = target.get("relative_error") - if relative_error is not None: - return float(relative_error) - - target_value = target.get("target") - final_estimate = target.get("final_estimate", target.get("final")) - if target_value in (None, 0) or final_estimate is None: - return None - return (float(final_estimate) - float(target_value)) / float(target_value) - - -def _validate_populace_critical_calibration_targets( - *, - country: str, - release_manifest_json: dict, - release_manifest_path: str | None, - data_package_name: str, - data_repo_id: str, - data_repo_type: str, - default_revision: str, -) -> None: - if country != "us" or data_package_name != "populace-data": - return - - diagnostics_artifact = release_manifest_json.get("artifacts", {}).get( - "calibration_diagnostics" - ) - if diagnostics_artifact is None: - raise ValueError( - "Populace release manifest is missing calibration_diagnostics; " - "refusing to certify without critical calibration target gates." - ) - - diagnostics = _fetch_json_release_artifact( - diagnostics_artifact, - release_manifest_path=release_manifest_path, - data_repo_id=data_repo_id, - data_repo_type=data_repo_type, - default_revision=default_revision, - ) - targets = diagnostics.get("targets") - if not isinstance(targets, list): - raise ValueError( - "Populace calibration_diagnostics is missing the target list; " - "refusing to certify without critical calibration target gates." 
- ) - targets_by_name = { - target.get("name"): target - for target in targets - if isinstance(target, dict) and target.get("name") - } - - failures = [] - for critical_target in _US_POPULACE_CRITICAL_CALIBRATION_TARGETS: - target = targets_by_name.get(critical_target.name) - if target is None: - failures.append(f"{critical_target.name}: missing") - continue - - relative_error = _calibration_relative_error(target) - if relative_error is None: - failures.append(f"{critical_target.name}: missing relative error") - continue - - if abs(relative_error) > critical_target.max_abs_relative_error: - failures.append( - f"{critical_target.name}: relative_error={relative_error:.6g} " - f"exceeds {critical_target.max_abs_relative_error:.6g}" - ) - - if failures: - raise ValueError( - "Populace critical calibration target gate failed: " + "; ".join(failures) - ) - - -def _updated_release_manifest_path( - current_path: str, - old_data: str, - new_data: str, -) -> str: - """Preserve country-specific release-manifest layout while bumping versions.""" - if old_data in current_path: - return current_path.replace(old_data, new_data) - return current_path - - -def _release_artifact_by_path( - release_manifest_json: dict, - path: str, -) -> Optional[dict]: - artifacts = release_manifest_json.get("artifacts", {}) - for artifact in artifacts.values(): - if artifact.get("path") == path: - return artifact - return None - - -def _metadata_sidecar_path(path: str) -> str: - return f"{path}.metadata.json" - - -def _release_scoped_artifact_path( - artifact: dict, - *, - release_manifest_path: str | None, - data_repo_id: str, -) -> str: - """Return the dereferenceable country-manifest path for a release artifact. - - Populace release manifests describe diagnostics relative to the release - directory, while the HF files are published under ``releases/{id}/``. - Runtime manifests must store the dereferenceable path. 
- """ - path = artifact.get("path", "") - if not path: - return path - if not release_manifest_path: - return path - release_dir = posixpath.dirname(release_manifest_path) - if ( - release_dir - and release_dir != "." - and artifact.get("kind") == "diagnostics" - and artifact.get("repo_id") == data_repo_id - and not path.startswith(f"{release_dir}/") - ): - return f"{release_dir}/{path}" - return path - - -def _specifier_matches(*, version: str, specifier: str) -> bool: - try: - return Version(version) in SpecifierSet(specifier) - except (InvalidSpecifier, InvalidVersion): - return False - - -def _release_manifest_has_compatible_model_package( - release_manifest_json: dict, - *, - package_name: str, - model_version: str, -) -> bool: - for compatible_model_package in release_manifest_json.get( - "compatible_model_packages", - [], - ): - if compatible_model_package.get("name") != package_name: - continue - if _specifier_matches( - version=model_version, - specifier=compatible_model_package.get("specifier", ""), - ): - return True - return False - - -def _release_manifest_compatibility_basis( - *, - release_manifest_json: dict, - current_manifest: CountryReleaseManifest, - package_name: str, - model_version: str, - built_with_model_version: str | None, - data_build_fingerprint: str | None, -) -> str: - if built_with_model_version == model_version: - return "exact_build_model_version" - - current_certification = current_manifest.certification - if ( - current_certification is not None - and current_certification.certified_for_model_version == model_version - and current_certification.data_build_fingerprint is not None - and current_certification.data_build_fingerprint == data_build_fingerprint - ): - return "matching_data_build_fingerprint" - - if _release_manifest_has_compatible_model_package( - release_manifest_json, - package_name=package_name, - model_version=model_version, - ): - return "legacy_compatible_model_package" - - raise ValueError( - "Data release 
manifest is not certified for " - f"{package_name}=={model_version}. Publish a data release manifest with " - "a matching build model, matching data-build fingerprint, or compatible " - "model-package specifier before refreshing the bundle." - ) - - -def _refresh_dataset_path_references_from_data_release( - manifest_json: dict, - release_manifest_json: dict, - *, - release_manifest_path: str | None = None, - data_repo_id: str, -) -> None: - """Refresh bundled dataset hash pins from a data release manifest. - - The certified default dataset is handled separately because it also carries - a URI and build ID. This helper covers every logical dataset entry under - ``datasets``; notably the US long-term bundle stores one entry per year with - both H5 and metadata-sidecar hashes. - """ - datasets = manifest_json.setdefault("datasets", {}) - release_artifacts = release_manifest_json.get("artifacts", {}) - - def update_reference_from_artifact(path_reference: dict, artifact: dict) -> None: - raw_path = artifact.get("path") - if raw_path: - path_reference["path"] = _release_scoped_artifact_path( - artifact, - release_manifest_path=release_manifest_path, - data_repo_id=data_repo_id, - ) - if artifact.get("revision"): - path_reference["revision"] = artifact["revision"] - if artifact.get("repo_id"): - path_reference["repo_id"] = artifact["repo_id"] - if artifact.get("repo_type"): - path_reference["repo_type"] = artifact["repo_type"] - - dataset_sha256 = artifact.get("sha256") - if dataset_sha256: - path_reference["sha256"] = dataset_sha256 - elif "sha256" in path_reference: - raise ValueError( - "Data release manifest dataset artifact lacks sha256 " - f"for existing pinned path {raw_path!r}; refusing to leave " - "stale dataset hash pin in place." 
- ) - - if not raw_path: - return - metadata_artifact = _release_artifact_by_path( - release_manifest_json, - _metadata_sidecar_path(raw_path), - ) - had_metadata_pin = "metadata_sha256" in path_reference - if metadata_artifact is None: - if had_metadata_pin: - raise ValueError( - "Data release manifest is missing metadata sidecar artifact " - f"for {raw_path!r}; refusing to drop existing metadata hash pin." - ) - path_reference.pop("metadata_sha256", None) - return - metadata_sha256 = metadata_artifact.get("sha256") - if not metadata_sha256: - if had_metadata_pin: - raise ValueError( - "Data release manifest metadata sidecar artifact lacks sha256 " - f"for {raw_path!r}; refusing to drop existing metadata hash pin." - ) - path_reference.pop("metadata_sha256", None) - return - path_reference["metadata_sha256"] = metadata_sha256 - - for name, path_reference in datasets.items(): - named_artifact = release_artifacts.get(name) - if named_artifact is not None: - update_reference_from_artifact(path_reference, named_artifact) - continue - - path = path_reference.get("path") - if not path: - continue - if path_reference.get("revision"): - continue - artifact = _release_artifact_by_path(release_manifest_json, path) - if artifact is None: - if "sha256" in path_reference or "metadata_sha256" in path_reference: - raise ValueError( - "Data release manifest is missing dataset artifact " - f"for existing pinned path {path!r}; refusing to leave " - "stale dataset hash pins in place." 
- ) - continue - update_reference_from_artifact(path_reference, artifact) - - for name, artifact in release_artifacts.items(): - if name in datasets: - continue - path_reference: dict = {} - update_reference_from_artifact(path_reference, artifact) - datasets[name] = path_reference - - -# --------------------------------------------------------------------------- -# Refresh result -# --------------------------------------------------------------------------- - - -@dataclass -class RefreshResult: - """What the refresh changed, for logs and PR bodies.""" - - country: str - old_model: str - new_model: str - old_data: str - new_data: str - old_wheel_sha256: str - new_wheel_sha256: str - old_dataset_sha256: str - new_dataset_sha256: str - manifest_path: Path - pyproject_updated: bool - - def summary(self) -> str: - lines = [ - f"Refreshed {self.country} release bundle:", - f" model: {self.old_model} -> {self.new_model}", - f" data: {self.old_data} -> {self.new_data}", - f" wheel sha256: {self.old_wheel_sha256[:12]}... -> " - f"{self.new_wheel_sha256[:12]}...", - f" dataset sha256: {self.old_dataset_sha256[:12]}... -> " - f"{self.new_dataset_sha256[:12]}...", - f" manifest: {self.manifest_path}", - ] - if self.pyproject_updated: - lines.append(" pyproject.toml: pin updated") - return "\n".join(lines) - - -# --------------------------------------------------------------------------- -# Core refresh function -# --------------------------------------------------------------------------- - - -def refresh_release_bundle( - country: str, - *, - model_version: Optional[str] = None, - data_version: Optional[str] = None, - release_manifest_path: Optional[str] = None, - release_manifest_revision: Optional[str] = None, - update_pyproject: bool = True, - manifest_dir: Path = MANIFEST_DIR, - pyproject_path: Path = PYPROJECT, -) -> RefreshResult: - """Refresh a country's release manifest in place. - - Args: - country: ``"us"`` or ``"uk"``. 
- model_version: New country-package version, e.g. ``"1.653.3"``. - If ``None``, keeps the existing pin. - data_version: New data-package version, e.g. ``"1.83.4"``. If - ``None``, keeps the existing pin. - release_manifest_path: Optional explicit data release manifest path. - Needed for custom bundles whose path does not include the data - package version, such as CRFB long-run candidate releases. - release_manifest_revision: Optional HF revision to fetch the data - release manifest from before pinning the immutable repo commit. - update_pyproject: When True, also bumps the country extra in - ``pyproject.toml`` to ``model_version``. - manifest_dir: Overridable for tests. - pyproject_path: Overridable for tests. - - Returns a :class:`RefreshResult` with the before/after of every - content-addressed pin. - """ - manifest_path = manifest_dir / f"{country}.json" - manifest_json = json.loads(manifest_path.read_text()) - current = CountryReleaseManifest.model_validate(manifest_json) - - old_model = current.model_package.version - old_data = current.data_package.version - old_wheel_sha256 = current.model_package.sha256 or "" - old_dataset_sha256 = current.certified_data_artifact.sha256 or "" - - new_model = model_version or old_model - new_data = data_version or old_data - - package_name = current.model_package.name # "policyengine-us" / "policyengine-uk" - - # Only hit PyPI if the model actually changed. Keeps no-op - # refreshes and data-only refreshes offline for the wheel pin. - if new_model != old_model: - wheel = _pypi_wheel_metadata(package_name, new_model) - new_wheel_sha256 = wheel["sha256"] - new_wheel_url = wheel["url"] - else: - new_wheel_sha256 = old_wheel_sha256 - new_wheel_url = current.model_package.wheel_url or "" - - # Dataset HF resolve URL inferred from the existing URI: we only - # change the ``@{revision}`` tail. 
- current_uri = current.certified_data_artifact.uri - repo_id_match = re.match(r"hf://([^/]+/[^/]+)/(.+?)@(.+)", current_uri) - if not repo_id_match: - raise ValueError( - f"Cannot parse current dataset URI {current_uri!r}; expected " - f"'hf://{{owner}}/{{repo}}/{{path}}@{{revision}}'" - ) - repo_id, dataset_path, _old_revision = repo_id_match.groups() - - data_package_json = manifest_json["data_package"] - release_manifest_json = None - new_release_manifest_revision = None - current_release_manifest_revision = data_package_json.get( - "release_manifest_revision" - ) - release_manifest_override = ( - release_manifest_path is not None or release_manifest_revision is not None - ) - new_release_manifest_path = release_manifest_path or data_package_json.get( - "release_manifest_path" - ) - should_fetch_release_manifest = new_release_manifest_path is not None and ( - new_data != old_data or new_model != old_model or release_manifest_override - ) - if should_fetch_release_manifest: - if release_manifest_path is None: - new_release_manifest_path = _updated_release_manifest_path( - current_path=new_release_manifest_path, - old_data=old_data, - new_data=new_data, - ) - fetch_revision = release_manifest_revision or ( - current_release_manifest_revision - if release_manifest_path is None and new_data == old_data - else new_data - ) - release_manifest_fetch = _fetch_data_release_manifest( - repo_id=repo_id, - release_manifest_path=new_release_manifest_path, - revision=fetch_revision, - repo_type=data_package_json.get("repo_type", "model"), - allow_main_fallback=release_manifest_revision is None, - ) - if release_manifest_fetch is None: - raise ValueError( - "Could not fetch data release manifest " - f"{new_release_manifest_path!r} from {repo_id}@{new_data}. " - "Refusing to refresh a release-manifest-backed bundle with " - "partial certification metadata." 
- ) - if release_manifest_fetch.repo_commit is None: - raise ValueError( - "Could not resolve an immutable HF commit for data release " - f"manifest {new_release_manifest_path!r} from {repo_id}@{new_data}." - ) - release_manifest_json = release_manifest_fetch.payload - release_manifest_data_version = release_manifest_json.get( - "data_package", {} - ).get("version") - if release_manifest_data_version != new_data: - raise ValueError( - "Data release manifest " - f"{new_release_manifest_path!r} from {repo_id} declares " - f"version {release_manifest_data_version!r}, expected {new_data!r}." - ) - new_release_manifest_revision = release_manifest_fetch.repo_commit - _validate_populace_critical_calibration_targets( - country=country, - release_manifest_json=release_manifest_json, - release_manifest_path=new_release_manifest_path, - data_package_name=data_package_json.get("name", ""), - data_repo_id=repo_id, - data_repo_type=data_package_json.get("repo_type", "model"), - default_revision=new_release_manifest_revision, - ) - - certified_dataset = ( - current.certified_data_artifact.dataset - if current.certified_data_artifact is not None - else current.default_dataset - ) - data_artifact_json = {} - if release_manifest_json is not None: - data_artifact_json = release_manifest_json.get("artifacts", {}).get( - certified_dataset, - {}, - ) - if not data_artifact_json: - raise ValueError( - "Data release manifest " - f"{new_release_manifest_path!r} from {repo_id}@{new_data} " - f"does not include certified dataset {certified_dataset!r}." 
- ) - dataset_repo_id = data_artifact_json.get("repo_id", repo_id) - dataset_path = data_artifact_json.get("path", dataset_path) - dataset_revision_default = ( - _old_revision - if new_data == old_data and not release_manifest_override - else new_data - ) - dataset_revision = data_artifact_json.get("revision", dataset_revision_default) - if ( - release_manifest_json is not None - and new_release_manifest_revision is not None - and dataset_repo_id == repo_id - and dataset_revision in {new_data, release_manifest_revision} - ): - dataset_revision = new_release_manifest_revision - - # Only hit HF if the data version or release manifest target changed. - if new_data != old_data or release_manifest_override: - new_dataset_sha256 = data_artifact_json.get("sha256") or _hf_dataset_sha256( - dataset_repo_id, - dataset_path, - dataset_revision, - ) - else: - new_dataset_sha256 = old_dataset_sha256 - new_uri = f"hf://{dataset_repo_id}/{dataset_path}@{dataset_revision}" - policyengine_version = _pyproject_version(pyproject_path) - - # Mutate the manifest JSON in place (keep unknown fields untouched). 
- manifest_json["model_package"]["version"] = new_model - manifest_json["model_package"]["sha256"] = new_wheel_sha256 - manifest_json["model_package"]["wheel_url"] = new_wheel_url - data_package_json["version"] = new_data - if new_data != old_data or release_manifest_override: - if new_release_manifest_path is not None: - data_package_json["release_manifest_path"] = new_release_manifest_path - if new_release_manifest_revision is not None: - data_package_json["release_manifest_revision"] = ( - new_release_manifest_revision - ) - manifest_json["certified_data_artifact"]["data_package"]["version"] = new_data - manifest_json["certified_data_artifact"]["build_id"] = ( - f"{current.data_package.name}-{new_data}" - ) - manifest_json["certified_data_artifact"]["uri"] = new_uri - manifest_json["certified_data_artifact"]["sha256"] = new_dataset_sha256 - manifest_json["certification"]["data_build_id"] = ( - f"{current.data_package.name}-{new_data}" - ) - manifest_json["certification"]["certified_for_model_version"] = new_model - if release_manifest_json is not None: - build = release_manifest_json.get("build") or {} - built_with_model = build.get("built_with_model_package") or {} - data_build_id = ( - build.get("build_id") or f"{current.data_package.name}-{new_data}" - ) - manifest_json["certified_data_artifact"]["build_id"] = data_build_id - certification_json = manifest_json["certification"] - certification_json["data_build_id"] = data_build_id - certification_json["certified_for_model_version"] = new_model - certification_json["certified_by"] = ( - f"{current.data_package.name} release manifest" - ) - built_with_model_version = built_with_model.get("version") - if built_with_model_version is not None: - certification_json["built_with_model_version"] = built_with_model_version - if built_with_model.get("git_sha") is not None: - certification_json["built_with_model_git_sha"] = built_with_model["git_sha"] - else: - certification_json.pop("built_with_model_git_sha", None) - 
data_build_fingerprint = built_with_model.get("data_build_fingerprint") - if data_build_fingerprint is not None: - certification_json["data_build_fingerprint"] = data_build_fingerprint - else: - certification_json.pop("data_build_fingerprint", None) - certification_json["compatibility_basis"] = ( - _release_manifest_compatibility_basis( - release_manifest_json=release_manifest_json, - current_manifest=current, - package_name=package_name, - model_version=new_model, - built_with_model_version=built_with_model_version, - data_build_fingerprint=data_build_fingerprint, - ) - ) - _refresh_dataset_path_references_from_data_release( - manifest_json, - release_manifest_json, - release_manifest_path=new_release_manifest_path, - data_repo_id=repo_id, - ) - - manifest_path.write_text( - json.dumps(manifest_json, indent=2, sort_keys=False) + "\n" - ) - get_release_manifest.cache_clear() - sync_release_manifest_policyengine_version( - policyengine_version=policyengine_version, - manifest_dir=manifest_dir, - ) - - pyproject_updated = False - if update_pyproject and model_version is not None: - pyproject_updated = _bump_pyproject_pin(pyproject_path, package_name, new_model) - - return RefreshResult( - country=country, - old_model=old_model, - new_model=new_model, - old_data=old_data, - new_data=new_data, - old_wheel_sha256=old_wheel_sha256, - new_wheel_sha256=new_wheel_sha256, - old_dataset_sha256=old_dataset_sha256, - new_dataset_sha256=new_dataset_sha256, - manifest_path=manifest_path, - pyproject_updated=pyproject_updated, - ) - - -# --------------------------------------------------------------------------- -# pyproject.toml pin update (regex-based; avoids adding a TOML writer dep) -# --------------------------------------------------------------------------- - - -def _bump_pyproject_pin( - pyproject_path: Path, package_name: str, new_version: str -) -> bool: - """Update the ``{package_name}=={version}`` line under country - extras. Returns True if a change was written. 
- - Only matches the exact ``"{package_name}==X.Y.Z"`` pin form that the - release manifests produce; any looser pin (``>=``, ``~=``, extras - markers) is left alone and signalled via the return value. - """ - text = pyproject_path.read_text() - pattern = rf'("{re.escape(package_name)}==)[^"]+(")' - new_text, n = re.subn(pattern, rf"\g<1>{new_version}\g<2>", text) - if n == 0: - return False - if new_text != text: - pyproject_path.write_text(new_text) - return True - return False - - -# --------------------------------------------------------------------------- -# Trace TRO regeneration -# --------------------------------------------------------------------------- - - -def regenerate_trace_tro(country: str, manifest_dir: Path = MANIFEST_DIR) -> Path: - """Regenerate ``{country}.trace.tro.jsonld`` from the country's - release manifest plus the live data-release manifest on HF when - that manifest is available. - - Thin wrapper around the same code path ``scripts/generate_trace_tros.py`` - uses; exposed here so the refresh function can chain - ``refresh_release_bundle(...)`` with TRO regeneration in one call. 
- """ - from policyengine.provenance.manifest import ( - DataReleaseManifestUnavailableError, - get_data_release_manifest, - get_release_manifest, - ) - from policyengine.provenance.trace import ( - build_trace_tro_from_release_bundle, - serialize_trace_tro, - ) - - get_release_manifest.cache_clear() - get_data_release_manifest.cache_clear() - release = get_release_manifest(country) - try: - data_release = get_data_release_manifest(country) - except DataReleaseManifestUnavailableError: - data_release = None - tro = build_trace_tro_from_release_bundle(release, data_release) - out_path = manifest_dir / f"{country}.trace.tro.jsonld" - out_path.write_bytes(serialize_trace_tro(tro)) - return out_path diff --git a/src/policyengine/provenance/certification.py b/src/policyengine/provenance/certification.py index 735d4e79..07cc8fee 100644 --- a/src/policyengine/provenance/certification.py +++ b/src/policyengine/provenance/certification.py @@ -6,38 +6,40 @@ the source of truth, and certification derives the vendored country manifest from it in one step. -The certification asserts that *this* policyengine release, with the -model package pinned in ``pyproject.toml``, serves the data release — -an assertion the test suite then exercises on the exact pair. +The certification asserts that *this* policyengine bundle, with the +model package pinned in ``src/policyengine/data/bundle/manifest.json``, +serves the data release — an assertion the test suite then exercises on the +exact pair. .. 
code-block:: python from policyengine.provenance.certification import certify_data_release result = certify_data_release( - country="us", + country="uk", + data_producer="populace", manifest_uri=( - "hf://dataset/policyengine/populace-us" - "@populace-us-2024-0cdbb27-c239dfe51c11-20260615T201302Z" - "/releases/populace-us-2024-0cdbb27-c239dfe51c11-20260615T201302Z" + "hf://dataset/policyengine/populace-uk-private" + "@populace-uk-2023-0cdbb27-c239dfe51c11-20260615T201302Z" + "/releases/populace-uk-2023-0cdbb27-c239dfe51c11-20260615T201302Z" "/release_manifest.json" ), ) print(result.summary()) -``scripts/certify_data_release.py`` is the argparse wrapper. Network -access is required (HF manifest fetch + PyPI wheel metadata). Countries -whose data release predates release manifests (UK's enhanced FRS) keep -using :mod:`policyengine.provenance.bundle` until their next release. +``scripts/bundle.py certify-data`` is the operator-facing wrapper. Network +access is required (HF manifest fetch + PyPI wheel metadata). Countries whose +data release predates release manifests need a dedicated data-producer strategy +before they can be updated through this path. 
""" from __future__ import annotations +import copy import hashlib import json import re from dataclasses import dataclass, field -from importlib.resources import files from pathlib import Path from typing import Optional @@ -60,7 +62,7 @@ "uk": "policyengine-uk", } -CERTIFIED_BY = "policyengine.py certification" +CERTIFIED_BY = "policyengine.py bundle certification" BASIS_BUILT_WITH = "built_with_model_package" BASIS_PUBLISHER_CLAIM = "compatible_model_packages" POPULACE_US_SOURCE_COVERAGE_FILE = "us_source_coverage.json" @@ -73,9 +75,10 @@ class CertificationError(ValueError): @dataclass class CertificationResult: country: str + data_producer: str manifest_uri: str manifest_sha256: str - country_manifest_path: Path + bundle_path: Path dataset_count: int default_dataset: str build_id: Optional[str] @@ -90,7 +93,8 @@ def summary(self) -> str: f" manifest: {self.manifest_uri}", f" manifest sha256: {self.manifest_sha256}", f" model: {self.model_package}=={self.model_version}", - f" wrote: {self.country_manifest_path}", + f" data-producer: {self.data_producer}", + f" wrote: {self.bundle_path}", ] for warning in self.warnings: lines.append(f" WARNING: {warning}") @@ -116,6 +120,10 @@ def https_manifest_url(parts: dict) -> str: ) +def default_bundle_source_path() -> Path: + return Path(__file__).resolve().parents[1] / "data" / "bundle" / "manifest.json" + + def https_release_file_url(parts: dict, filename: str) -> str: prefix = "datasets/" if parts["repo_type"] == "dataset" else "" release_dir = release_manifest_dir(parts) @@ -420,6 +428,141 @@ def build_country_manifest_payload( } +def build_bundle_data_release_payload( + *, + country_payload: dict, + data_producer: str, + manifest_uri: str, + uri_parts: dict, +) -> dict: + """Map the country certification payload into bundle ``data_releases``.""" + payload = copy.deepcopy(country_payload) + certified_artifact = payload.get("certified_data_artifact") or {} + data_package = payload.get("data_package") or {} + build_id = 
certified_artifact.get("build_id") or data_package.get("version") + payload["data_producer"] = data_producer + payload["version"] = build_id + payload["build_id"] = build_id + payload["source_manifest_uri"] = manifest_uri + payload["release_manifest_uri"] = https_manifest_url(uri_parts) + payload["default_dataset_uri"] = certified_artifact.get("uri") + return payload + + +class DataProducerCertificationStrategy: + data_producer: str + + def certify( + self, + *, + country: str, + manifest_uri: str, + model_package: str, + model_version: str, + token: Optional[str], + check_artifacts: bool, + ) -> tuple[dict, str, list[str]]: + raise NotImplementedError + + +class LegacyDataProducerCertificationStrategy(DataProducerCertificationStrategy): + data_producer = "legacy" + + def certify( + self, + *, + country: str, + manifest_uri: str, + model_package: str, + model_version: str, + token: Optional[str], + check_artifacts: bool, + ) -> tuple[dict, str, list[str]]: + raise CertificationError( + "Legacy data-producer certification updates are not implemented in " + "the bundle manifest workflow yet." 
+ ) + + +class PopulaceDataProducerCertificationStrategy(DataProducerCertificationStrategy): + data_producer = "populace" + + def certify( + self, + *, + country: str, + manifest_uri: str, + model_package: str, + model_version: str, + token: Optional[str], + check_artifacts: bool, + ) -> tuple[dict, str, list[str]]: + manifest, manifest_sha256, uri_parts = fetch_release_manifest( + manifest_uri, token=token + ) + compatibility_basis, warnings = validate_release_manifest( + manifest, model_package, model_version + ) + + for filename in required_supplemental_release_files( + country, manifest, uri_parts + ): + if not head_release_file(uri_parts, filename, token=token): + raise CertificationError( + "Required supplemental release file is not reachable: " + f"{filename} at {https_release_file_url(uri_parts, filename)}" + ) + + default_artifact = manifest.artifacts[manifest.default_datasets["national"]] + if check_artifacts and not head_artifact(default_artifact, token=token): + raise CertificationError( + f"Certified dataset artifact is not reachable: {default_artifact.uri}" + ) + + model_wheel = fetch_pypi_wheel_metadata(model_package, model_version) + country_payload = build_country_manifest_payload( + country=country, + manifest=manifest, + uri_parts=uri_parts, + policyengine_version=policyengine_version(), + model_package=model_package, + model_version=model_version, + model_wheel=model_wheel or {}, + compatibility_basis=compatibility_basis, + ) + if check_artifacts and should_validate_vendored_artifacts( + country, manifest, uri_parts + ): + for name, reference in country_payload["datasets"].items(): + if not head_artifact_reference(reference, uri_parts, token=token): + raise CertificationError( + f"Vendored artifact {name!r} is not reachable at " + f"{artifact_reference_url(reference, uri_parts)}" + ) + + return ( + build_bundle_data_release_payload( + country_payload=country_payload, + data_producer=self.data_producer, + manifest_uri=manifest_uri, + 
uri_parts=uri_parts, + ), + manifest_sha256, + warnings, + ) + + +def certification_strategy( + country: str, data_producer: Optional[str] = None +) -> DataProducerCertificationStrategy: + producer = data_producer or ("populace" if country == "uk" else "legacy") + if producer == "populace": + return PopulaceDataProducerCertificationStrategy() + if producer == "legacy": + return LegacyDataProducerCertificationStrategy() + raise CertificationError(f"Unknown data-producer {producer!r}.") + + def installed_model_version(model_package: str) -> str: from importlib.metadata import version @@ -436,71 +579,45 @@ def certify_data_release( *, country: str, manifest_uri: str, + data_producer: Optional[str] = None, model_version: Optional[str] = None, token: Optional[str] = None, - output_dir: Optional[Path] = None, + bundle_path: Optional[Path] = None, check_artifacts: bool = True, ) -> CertificationResult: if country not in COUNTRY_MODEL_PACKAGES: raise CertificationError(f"Unknown country {country!r}.") model_package = COUNTRY_MODEL_PACKAGES[country] model_version = model_version or installed_model_version(model_package) + strategy = certification_strategy(country, data_producer) - manifest, manifest_sha256, uri_parts = fetch_release_manifest( - manifest_uri, token=token - ) - compatibility_basis, warnings = validate_release_manifest( - manifest, model_package, model_version - ) - - for filename in required_supplemental_release_files(country, manifest, uri_parts): - if not head_release_file(uri_parts, filename, token=token): - raise CertificationError( - "Required supplemental release file is not reachable: " - f"{filename} at {https_release_file_url(uri_parts, filename)}" - ) - - default_artifact = manifest.artifacts[manifest.default_datasets["national"]] - if check_artifacts and not head_artifact(default_artifact, token=token): - raise CertificationError( - f"Certified dataset artifact is not reachable: {default_artifact.uri}" - ) - - model_wheel = 
fetch_pypi_wheel_metadata(model_package, model_version) - - payload = build_country_manifest_payload( + data_release, manifest_sha256, warnings = strategy.certify( country=country, - manifest=manifest, - uri_parts=uri_parts, - policyengine_version=policyengine_version(), + manifest_uri=manifest_uri, model_package=model_package, model_version=model_version, - model_wheel=model_wheel or {}, - compatibility_basis=compatibility_basis, + token=token, + check_artifacts=check_artifacts, ) - if check_artifacts and should_validate_vendored_artifacts( - country, manifest, uri_parts - ): - for name, reference in payload["datasets"].items(): - if not head_artifact_reference(reference, uri_parts, token=token): - raise CertificationError( - f"Vendored artifact {name!r} is not reachable at " - f"{artifact_reference_url(reference, uri_parts)}" - ) - - if output_dir is None: - output_dir = Path(str(files("policyengine"))) / "data" / "release_manifests" - output_path = output_dir / f"{country}.json" - output_path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n") + bundle_path = bundle_path or default_bundle_source_path() + bundle = json.loads(bundle_path.read_text()) + bundle.setdefault("data_releases", {})[country] = data_release + bundle.setdefault("countries", {}).setdefault(country, {})["model_package"] = ( + model_package + ) + if model_package in bundle.get("packages", {}): + bundle["packages"][model_package]["version"] = model_version + bundle_path.write_text(json.dumps(bundle, indent=2, sort_keys=True) + "\n") return CertificationResult( country=country, + data_producer=strategy.data_producer, manifest_uri=manifest_uri, manifest_sha256=manifest_sha256, - country_manifest_path=output_path, - dataset_count=len(payload["datasets"]), - default_dataset=payload["default_dataset"], - build_id=payload["certified_data_artifact"].get("build_id"), + bundle_path=bundle_path, + dataset_count=len(data_release["datasets"]), + default_dataset=data_release["default_dataset"], 
+ build_id=data_release["certified_data_artifact"].get("build_id"), model_package=model_package, model_version=model_version, warnings=warnings, diff --git a/src/policyengine/provenance/manifest.py b/src/policyengine/provenance/manifest.py index b9af72ee..cd3dd7bd 100644 --- a/src/policyengine/provenance/manifest.py +++ b/src/policyengine/provenance/manifest.py @@ -1,4 +1,5 @@ import hashlib +import json import os from functools import lru_cache from importlib import import_module @@ -252,14 +253,18 @@ def fetch_pypi_wheel_metadata(name: str, version: str) -> dict[str, Optional[str @lru_cache def get_release_manifest(country_id: str) -> CountryReleaseManifest: - manifest_path = files("policyengine").joinpath( - "data", "release_manifests", f"{country_id}.json" - ) + manifest_path = files("policyengine").joinpath("data", "bundle", "manifest.json") if not manifest_path.is_file(): - raise ValueError(f"No bundled release manifest for country '{country_id}'") + raise ValueError("No bundled PolicyEngine bundle manifest found.") + + source_bytes = manifest_path.read_text().encode() + bundle = json.loads(source_bytes) + try: + release_payload = bundle["data_releases"][country_id] + except KeyError as exc: + raise ValueError(f"No bundled data release for country '{country_id}'") from exc - source_bytes = manifest_path.read_bytes() - manifest = CountryReleaseManifest.model_validate_json(source_bytes) + manifest = CountryReleaseManifest.model_validate(release_payload) manifest.source_sha256 = hashlib.sha256(source_bytes).hexdigest() return manifest @@ -336,11 +341,7 @@ def certify_data_release_compatibility( data_release_manifest = get_data_release_manifest(country_id) except DataReleaseManifestUnavailableError as exc: bundled_certification = country_manifest.certification - if ( - bundled_certification is not None - and bundled_certification.certified_for_model_version - == runtime_model_version - ): + if bundled_certification is not None: if ( runtime_data_build_fingerprint 
is not None and bundled_certification.data_build_fingerprint is not None @@ -353,7 +354,24 @@ def certify_data_release_compatibility( "Runtime data build fingerprint does not match the bundled " "data certification." ) - return bundled_certification + if ( + bundled_certification.certified_for_model_version + == runtime_model_version + ): + return bundled_certification + return DataCertification( + compatibility_basis="unverified_data_release_manifest_unavailable", + certified_for_model_version=runtime_model_version, + data_build_id=bundled_certification.data_build_id, + built_with_model_version=( + bundled_certification.built_with_model_version + ), + built_with_model_git_sha=( + bundled_certification.built_with_model_git_sha + ), + data_build_fingerprint=(bundled_certification.data_build_fingerprint), + certified_by=bundled_certification.certified_by, + ) raise exc built_with_model = ( data_release_manifest.build.built_with_model_package diff --git a/src/policyengine/provenance/trace.py b/src/policyengine/provenance/trace.py index 185c8228..777b233d 100644 --- a/src/policyengine/provenance/trace.py +++ b/src/policyengine/provenance/trace.py @@ -346,10 +346,7 @@ def build_trace_tro_from_release_bundle( f"provides a SHA256 for dataset '{certified_artifact.dataset}'." 
) - bundle_manifest_location = ( - bundle_manifest_path - or f"data/release_manifests/{country_manifest.country_id}.json" - ) + bundle_manifest_location = bundle_manifest_path or "data/bundle/manifest.json" data_manifest_location = ( data_release_manifest_path or https_release_manifest_uri(country_manifest.data_package) diff --git a/tests/test_bump_version.py b/tests/test_bump_version.py index 105d5cda..76d9c61a 100644 --- a/tests/test_bump_version.py +++ b/tests/test_bump_version.py @@ -1,6 +1,7 @@ from __future__ import annotations import importlib.util +import json from pathlib import Path import pytest @@ -89,40 +90,54 @@ def test_update_file_replaces_stale_version_field(tmp_path): assert 'version = "3.4.3"' in pyproject.read_text() -def test_sync_release_manifest_versions_rewrites_bundle_identity(tmp_path): - manifest_dir = tmp_path / "release_manifests" - manifest_dir.mkdir() - manifest_path = manifest_dir / "uk.json" - manifest_path.write_text( - "{\n" - ' "schema_version": 1,\n' - ' "bundle_id": "uk-4.0.0",\n' - ' "country_id": "uk",\n' - ' "policyengine_version": "4.0.0"\n' - "}\n" +def test_sync_bundle_versions_rewrites_bundle_identity(tmp_path): + bundle_path = tmp_path / "manifest.json" + bundle_path.write_text( + json.dumps( + { + "bundle_version": "4.0.0", + "policyengine_version": "4.0.0", + "packages": { + "policyengine": { + "name": "policyengine", + "version": "4.0.0", + } + }, + "data_releases": { + "uk": { + "bundle_id": "uk-4.0.0", + "policyengine_version": "4.0.0", + } + }, + } + ) + + "\n" ) - bump_version.sync_release_manifest_versions(manifest_dir, "4.3.2") - - text = manifest_path.read_text() - assert '"bundle_id": "uk-4.3.2"' in text - assert '"policyengine_version": "4.3.2"' in text - - -def test_sync_release_manifest_versions_fails_when_required_field_missing(tmp_path): - manifest_dir = tmp_path / "release_manifests" - manifest_dir.mkdir() - manifest_path = manifest_dir / "uk.json" - manifest_path.write_text( - "{\n" - ' 
"schema_version": 1,\n' - ' "bundle_id": "uk-4.0.0",\n' - ' "country_id": "uk"\n' - "}\n" + bump_version.sync_bundle_versions(bundle_path, "4.3.2") + + bundle = json.loads(bundle_path.read_text()) + assert bundle["bundle_version"] == "4.3.2" + assert bundle["policyengine_version"] == "4.3.2" + assert bundle["packages"]["policyengine"]["version"] == "4.3.2" + assert bundle["data_releases"]["uk"]["bundle_id"] == "uk-4.3.2" + assert bundle["data_releases"]["uk"]["policyengine_version"] == "4.3.2" + + +def test_sync_bundle_versions_fails_when_required_field_missing(tmp_path): + bundle_path = tmp_path / "manifest.json" + bundle_path.write_text( + json.dumps( + { + "bundle_version": "4.0.0", + "policyengine_version": "4.0.0", + } + ) + + "\n" ) - original = manifest_path.read_text() + original = bundle_path.read_text() with pytest.raises(SystemExit): - bump_version.sync_release_manifest_versions(manifest_dir, "4.3.2") + bump_version.sync_bundle_versions(bundle_path, "4.3.2") - assert manifest_path.read_text() == original + assert bundle_path.read_text() == original diff --git a/tests/test_bundle.py b/tests/test_bundle.py new file mode 100644 index 00000000..adfbb100 --- /dev/null +++ b/tests/test_bundle.py @@ -0,0 +1,428 @@ +import hashlib +import json +import sys +from pathlib import Path + +import pytest + +from policyengine import bundle +from policyengine.cli import main as cli_main + + +def _sha256(payload: bytes) -> str: + return hashlib.sha256(payload).hexdigest() + + +def _manifest_with_dataset_sha(country: str, sha256: str) -> dict: + manifest = json.loads(json.dumps(bundle.get_current_bundle())) + release = manifest["data_releases"][country] + dataset = release["default_dataset"] + release["datasets"][dataset]["sha256"] = sha256 + release["certified_data_artifact"]["sha256"] = sha256 + return manifest + + +def _write_manifest(tmp_path, manifest: dict) -> str: + path = tmp_path / "bundle-manifest.json" + path.write_text(json.dumps(manifest), encoding="utf-8") + 
return str(path) + + +def test_bundle_manifest_exposes_data_releases(): + manifest = bundle.get_current_bundle() + + assert manifest["bundle_version"] == manifest["policyengine_version"] + assert manifest["data_releases"]["us"]["data_producer"] == "populace" + assert manifest["data_releases"]["us"]["version"].startswith("populace-us-2024-") + assert manifest["data_releases"]["uk"]["data_producer"] == "populace" + assert manifest["data_releases"]["uk"]["version"].startswith("populace-uk-2023-") + assert manifest["data_releases"]["uk"]["default_dataset_uri"].startswith( + "hf://policyengine/populace-uk-private/" + ) + + +def test_bundle_install_requirements_are_country_scoped(): + manifest = bundle.get_current_bundle() + + assert bundle.bundle_install_requirements(manifest, countries=["uk"]) == [ + f"policyengine=={manifest['policyengine_version']}", + manifest["packages"]["policyengine-core"]["install_requirement"], + manifest["packages"]["policyengine-uk"]["install_requirement"], + ] + assert ( + "policyengine-us-data==1.78.2; python_version >= '3.12' and python_version < '3.15'" + in bundle.bundle_install_requirements( + manifest, + countries=["us"], + ) + ) + + +def test_dataset_plans_use_certified_release_metadata(tmp_path): + plans = bundle.dataset_plans( + bundle.get_current_bundle(), + countries=["uk"], + data_dir=tmp_path, + ) + + assert len(plans) == 1 + assert plans[0].country == "uk" + assert plans[0].data_version.startswith("populace-uk-2023-") + assert plans[0].data_producer == "populace" + assert plans[0].repo_type == "dataset" + assert plans[0].destination == tmp_path / "populace_uk_2023.h5" + assert ( + plans[0].expected_sha256 + == bundle.get_current_bundle()["data_releases"]["uk"]["datasets"][ + "populace_uk_2023" + ]["sha256"] + ) + + +def test_runtime_strategy_selects_populace(): + assert isinstance( + bundle.runtime_strategy("populace"), + bundle.PopulaceDataProducerRuntimeStrategy, + ) + + +def 
test_populace_runtime_strategy_requires_certified_hash(): + manifest = json.loads(json.dumps(bundle.get_current_bundle())) + release = manifest["data_releases"]["uk"] + dataset = release["default_dataset"] + del release["datasets"][dataset]["sha256"] + del release["certified_data_artifact"]["sha256"] + + with pytest.raises(bundle.BundleError, match="certified sha256"): + bundle.dataset_plans(manifest, countries=["uk"]) + + +def test_install_bundle_package_only_uses_explicit_python(monkeypatch, tmp_path): + calls = [] + + def fake_install(target_python, requirements, *, dry_run=False): + calls.append((target_python, requirements, dry_run)) + + monkeypatch.setattr(bundle, "install_package_scaffold", fake_install) + + result = bundle.install_bundle( + python=sys.executable, + countries=["uk"], + data_dir=tmp_path, + no_datasets=True, + ) + + assert result["countries"] == ["uk"] + assert calls[0][0] == Path(sys.executable).resolve() + receipt = bundle.read_receipt(tmp_path) + assert receipt is not None + assert receipt["target_python"] == str(Path(sys.executable).resolve()) + assert calls[0][1] == [ + f"policyengine=={result['bundle_version']}", + bundle.get_current_bundle()["packages"]["policyengine-core"][ + "install_requirement" + ], + bundle.get_current_bundle()["packages"]["policyengine-uk"][ + "install_requirement" + ], + ] + + +def test_resolve_target_python_accepts_path_executable(monkeypatch, tmp_path): + python_path = tmp_path / "python" + python_path.write_text("") + monkeypatch.setattr(bundle.shutil, "which", lambda name: str(python_path)) + + assert bundle.resolve_target_python(python="python") == python_path + + +def test_resolve_target_python_defaults_to_local_venv(monkeypatch, tmp_path): + monkeypatch.chdir(tmp_path) + monkeypatch.delenv("VIRTUAL_ENV", raising=False) + monkeypatch.delenv("CONDA_PREFIX", raising=False) + + assert bundle.resolve_target_python(create_venv=False) == ( + tmp_path / ".venv" / "bin" / "python" + ) + assert not (tmp_path / 
".venv").exists() + + +def test_resolve_target_python_uses_local_venv_from_runner_env(monkeypatch, tmp_path): + monkeypatch.chdir(tmp_path) + monkeypatch.setenv("VIRTUAL_ENV", str(tmp_path / "uvx-runner")) + monkeypatch.delenv("CONDA_PREFIX", raising=False) + + assert bundle.resolve_target_python(create_venv=False) == ( + tmp_path / ".venv" / "bin" / "python" + ) + assert not (tmp_path / ".venv").exists() + + +class FakeResponse: + status_code = 200 + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def raise_for_status(self): + return None + + def iter_content(self, chunk_size): + yield b"new-data" + + +class FakeSession: + def get(self, *args, **kwargs): + return FakeResponse() + + +def test_install_datasets_downloads_then_backs_up_existing_file(tmp_path): + manifest = _manifest_with_dataset_sha("us", _sha256(b"new-data")) + existing = tmp_path / "populace_us_2024.h5" + existing.write_bytes(b"old-data") + + installed = bundle.install_datasets( + manifest, + countries=["us"], + data_dir=tmp_path, + yes=True, + session=FakeSession(), + ) + + assert installed[0]["country"] == "us" + assert installed[0]["expected_sha256"] == _sha256(b"new-data") + assert installed[0]["installed_sha256"] == _sha256(b"new-data") + assert installed[0]["build_id"] == manifest["data_releases"]["us"]["build_id"] + assert existing.read_bytes() == b"new-data" + backups = list((tmp_path / bundle.BACKUP_DIR_NAME).glob("*/populace_us_2024.h5")) + assert len(backups) == 1 + assert backups[0].read_bytes() == b"old-data" + + +def test_install_datasets_rejects_downloaded_hash_mismatch(tmp_path): + manifest = _manifest_with_dataset_sha("us", _sha256(b"expected-data")) + existing = tmp_path / "populace_us_2024.h5" + existing.write_bytes(b"old-data") + + with pytest.raises(bundle.BundleError, match="sha256"): + bundle.install_datasets( + manifest, + countries=["us"], + data_dir=tmp_path, + yes=True, + session=FakeSession(), + ) + + assert 
existing.read_bytes() == b"old-data" + assert not (tmp_path / bundle.BACKUP_DIR_NAME).exists() + + +def test_status_matches_receipt_and_packages(monkeypatch, tmp_path): + manifest = _manifest_with_dataset_sha("uk", _sha256(b"data")) + datasets = [ + { + "country": "uk", + "dataset": "populace_uk_2023", + "version": manifest["data_releases"]["uk"]["version"], + "uri": manifest["data_releases"]["uk"]["default_dataset_uri"], + "path": str(tmp_path / "populace_uk_2023.h5"), + "expected_sha256": _sha256(b"data"), + "installed_sha256": _sha256(b"data"), + } + ] + (tmp_path / "populace_uk_2023.h5").write_bytes(b"data") + bundle.write_receipt( + manifest, + data_dir=tmp_path, + countries=["uk"], + datasets=datasets, + ) + versions = { + component["name"]: component["version"] + for component in manifest["packages"].values() + } + monkeypatch.setattr(bundle.metadata, "version", lambda name: versions[name]) + + report = bundle.inspect_bundle_status( + manifest_ref=_write_manifest(tmp_path, manifest), + countries=["uk"], + data_dir=tmp_path, + ) + + assert report["matched"] is True + assert {check["status"] for check in report["packages"]} == {"ok"} + assert {check["status"] for check in report["datasets"]} == {"ok"} + assert report["datasets"][0]["installed_sha256"] == _sha256(b"data") + + +def test_status_uses_receipt_target_python(monkeypatch, tmp_path): + manifest = _manifest_with_dataset_sha("uk", _sha256(b"data")) + target_python = tmp_path / ".venv" / "bin" / "python" + target_python.parent.mkdir(parents=True) + target_python.write_text("") + datasets = [ + { + "country": "uk", + "dataset": "populace_uk_2023", + "version": manifest["data_releases"]["uk"]["version"], + "uri": manifest["data_releases"]["uk"]["default_dataset_uri"], + "path": str(tmp_path / "populace_uk_2023.h5"), + "expected_sha256": _sha256(b"data"), + "installed_sha256": _sha256(b"data"), + } + ] + (tmp_path / "populace_uk_2023.h5").write_bytes(b"data") + bundle.write_receipt( + manifest, + 
data_dir=tmp_path, + countries=["uk"], + datasets=datasets, + target_python=target_python, + ) + versions = { + component["name"]: component["version"] + for component in manifest["packages"].values() + } + calls = [] + + def fake_versions(python_path, components): + calls.append(python_path) + return versions, None + + monkeypatch.setattr(bundle, "_package_versions_from_python", fake_versions) + + report = bundle.inspect_bundle_status( + manifest_ref=_write_manifest(tmp_path, manifest), + countries=["uk"], + data_dir=tmp_path, + ) + + assert report["matched"] is True + assert calls == [target_python.resolve()] + assert report["target_python"] == str(target_python.resolve()) + + +def test_status_reports_dataset_hash_mismatch(monkeypatch, tmp_path): + manifest = _manifest_with_dataset_sha("uk", _sha256(b"certified-data")) + dataset_path = tmp_path / "populace_uk_2023.h5" + dataset_path.write_bytes(b"tampered-data") + bundle.write_receipt( + manifest, + data_dir=tmp_path, + countries=["uk"], + datasets=[ + { + "country": "uk", + "dataset": "populace_uk_2023", + "version": manifest["data_releases"]["uk"]["version"], + "uri": manifest["data_releases"]["uk"]["default_dataset_uri"], + "path": str(dataset_path), + "expected_sha256": _sha256(b"certified-data"), + "installed_sha256": _sha256(b"certified-data"), + } + ], + ) + versions = { + component["name"]: component["version"] + for component in manifest["packages"].values() + } + monkeypatch.setattr(bundle.metadata, "version", lambda name: versions[name]) + + report = bundle.inspect_bundle_status( + manifest_ref=_write_manifest(tmp_path, manifest), + countries=["uk"], + data_dir=tmp_path, + ) + + assert report["matched"] is False + assert report["datasets"][0]["status"] == "sha256_mismatch" + assert report["datasets"][0]["expected_sha256"] == _sha256(b"certified-data") + assert report["datasets"][0]["installed_sha256"] == _sha256(b"tampered-data") + + +def test_status_reports_missing_receipt_target_python(tmp_path): + 
manifest = bundle.get_current_bundle() + missing_python = tmp_path / ".venv" / "bin" / "python" + bundle.write_receipt( + manifest, + data_dir=tmp_path, + countries=["uk"], + datasets=[], + target_python=missing_python, + ) + + report = bundle.inspect_bundle_status( + countries=["uk"], + data_dir=tmp_path, + packages_only=True, + ) + + assert report["matched"] is False + assert {check["status"] for check in report["packages"]} == { + "target_python_missing" + } + + +def test_status_treats_corrupt_receipt_as_missing(monkeypatch, tmp_path): + manifest = bundle.get_current_bundle() + (tmp_path / bundle.RECEIPT_FILENAME).write_text("{not-json", encoding="utf-8") + versions = { + component["name"]: component["version"] + for component in manifest["packages"].values() + } + monkeypatch.setattr(bundle.metadata, "version", lambda name: versions[name]) + + report = bundle.inspect_bundle_status(countries=["uk"], data_dir=tmp_path) + + assert report["matched"] is False + assert report["receipt"] is None + assert report["datasets"][0]["status"] == "missing_receipt" + + +def test_bundle_manifest_cli_outputs_json(capsys): + exit_code = cli_main(["bundle", "manifest"]) + + assert exit_code == 0 + payload = json.loads(capsys.readouterr().out) + assert payload["bundle_version"] == payload["policyengine_version"] + + +def test_bundle_install_dry_run_cli_uses_standard_flags(capsys): + exit_code = cli_main( + [ + "bundle", + "install", + "--python", + sys.executable, + "--country", + "uk", + "--no-datasets", + "--dry-run", + ] + ) + + assert exit_code == 0 + output = capsys.readouterr().out + assert "pip install" in output + assert ( + bundle.get_current_bundle()["packages"]["policyengine-uk"][ + "install_requirement" + ] + in output + ) + + +def test_bundle_verify_cli_handles_unknown_bundle(capsys): + exit_code = cli_main(["bundle", "verify", "0.0.0"]) + + captured = capsys.readouterr() + assert exit_code == 1 + assert "Bundle '0.0.0'" in captured.err + + +def 
test_unknown_bundle_version_is_named(): + with pytest.raises(bundle.BundleError, match="Bundle '0.0.0'"): + bundle.load_bundle_manifest("0.0.0") diff --git a/tests/test_bundle_metadata.py b/tests/test_bundle_metadata.py new file mode 100644 index 00000000..1ac44a55 --- /dev/null +++ b/tests/test_bundle_metadata.py @@ -0,0 +1,120 @@ +import importlib.util +import json +import sys +from pathlib import Path + +from policyengine import bundle +from policyengine.cli import main as cli_main + + +def _load_export_script(monkeypatch): + scripts_dir = Path(__file__).resolve().parents[1] / "scripts" + monkeypatch.syspath_prepend(str(scripts_dir)) + spec = importlib.util.spec_from_file_location( + "export_bundle_release_assets", + scripts_dir / "export_bundle_release_assets.py", + ) + module = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(module) + return module + + +def test_bundle_manifest_exposes_model_and_country_extras(): + manifest = bundle.get_current_bundle() + + assert manifest["bundle_version"] == manifest["policyengine_version"] + assert set(manifest["extras"]) == {"models", "uk", "us"} + assert manifest["extras"]["models"] == [ + "policyengine-core", + "policyengine-us", + "policyengine-uk", + ] + assert manifest["extras"]["us"] == [ + "policyengine-core", + "policyengine-us", + ] + assert manifest["extras"]["uk"] == [ + "policyengine-core", + "policyengine-uk", + ] + assert "policyengine-uk-data" not in manifest["packages"] + + +def test_bundle_manifest_carries_populace_uk_data_release(): + release = bundle.get_current_bundle()["data_releases"]["uk"] + + assert release["data_producer"] == "populace" + assert release["data_package"]["name"] == "populace-data" + assert release["data_package"]["repo_type"] == "dataset" + assert release["default_dataset"] == "populace_uk_2023" + assert release["version"].startswith("populace-uk-2023-") + assert release["default_dataset_uri"].startswith( + 
"hf://policyengine/populace-uk-private/" + ) + + +def test_bundle_manifest_cli_outputs_json(capsys): + exit_code = cli_main(["bundle", "manifest"]) + + assert exit_code == 0 + payload = json.loads(capsys.readouterr().out) + assert payload["bundle_version"] == payload["policyengine_version"] + assert set(payload["packages"]) >= { + "policyengine", + "policyengine-core", + "policyengine-us", + "policyengine-uk", + } + + +def test_bundle_verify_packages_only_cli_outputs_json(monkeypatch, capsys): + manifest = bundle.get_current_bundle() + versions = { + component["name"]: component["version"] + for component in manifest["packages"].values() + } + monkeypatch.setattr(bundle.metadata, "version", lambda name: versions[name]) + + exit_code = cli_main( + [ + "bundle", + "verify", + "--country", + "us", + "--country", + "uk", + "--packages-only", + "--json", + ] + ) + + assert exit_code == 0 + payload = json.loads(capsys.readouterr().out) + assert payload["matched"] is True + assert payload["datasets"] == [] + + +def test_export_release_assets_writes_bundle_assets(monkeypatch, tmp_path): + export_script = _load_export_script(monkeypatch) + manifest = bundle.get_current_bundle() + version = manifest["bundle_version"] + monkeypatch.setattr( + sys, + "argv", + ["export_bundle_release_assets", "--dist-dir", str(tmp_path)], + ) + + assert export_script.main() == 0 + + bundle_manifest = tmp_path / f"policyengine-bundle-{version}.json" + assert bundle_manifest.exists() + assert json.loads(bundle_manifest.read_text()) == manifest + assert ( + f"policyengine=={manifest['policyengine_version']}" + in (tmp_path / f"policyengine-bundle-{version}.constraints.txt").read_text() + ) + assert ( + "PolicyEngine bundle" + in (tmp_path / f"policyengine-bundle-{version}.citation.txt").read_text() + ) diff --git a/tests/test_bundle_refresh.py b/tests/test_bundle_refresh.py deleted file mode 100644 index 38da76e8..00000000 --- a/tests/test_bundle_refresh.py +++ /dev/null @@ -1,1484 +0,0 @@ 
-"""Unit tests for ``policyengine.provenance.bundle.refresh_release_bundle``. - -Mocks the PyPI JSON API and the HF ``resolve`` endpoint so the tests -run offline. Exercises: - -- Updating only the model version (data-version unchanged). -- Updating only the data version (model unchanged). -- Updating both in one call. -- ``pyproject.toml`` pin rewrite. -- ``--no-pyproject`` / ``update_pyproject=False`` short-circuits. -- Error paths: PyPI has no matching wheel; URI is malformed. - -The end-to-end TRO regeneration requires the bundled -release-manifest resolver and a live HF metadata call, so it is -tested separately in ``tests/test_release_manifests.py`` via the -existing script-level hook. This file covers only the pure-refresh -surface. -""" - -from __future__ import annotations - -import hashlib -import io -import json -from pathlib import Path -from unittest.mock import patch -from urllib.error import HTTPError - -import pytest - -from policyengine.provenance.bundle import refresh_release_bundle - -PYPI_PAYLOAD_TEMPLATE = { - "urls": [ - { - "packagetype": "bdist_wheel", - "filename": "policyengine_us-NEW_VERSION-py3-none-any.whl", - "url": "https://files.pythonhosted.org/packages/ff/00/policyengine_us-NEW_VERSION-py3-none-any.whl", - "digests": {"sha256": "a" * 64}, - }, - # Source-dist should be ignored. - { - "packagetype": "sdist", - "filename": "policyengine_us-NEW_VERSION.tar.gz", - "url": "https://files.pythonhosted.org/packages/ff/00/policyengine_us-NEW_VERSION.tar.gz", - "digests": {"sha256": "b" * 64}, - }, - ] -} - - -def _pypi_response(package: str, version: str): - """Return a mock PyPI ``urlopen`` response.""" - payload = json.loads( - json.dumps(PYPI_PAYLOAD_TEMPLATE).replace("NEW_VERSION", version) - ) - # PyPI urls contain the filename; replace the package placeholder too. 
- for u in payload["urls"]: - u["filename"] = u["filename"].replace( - "policyengine_us", package.replace("-", "_") - ) - u["url"] = u["url"].replace("policyengine_us", package.replace("-", "_")) - return io.BytesIO(json.dumps(payload).encode()) - - -class _FakeHFResponse: - """Streams a deterministic byte sequence so sha256 is predictable.""" - - def __init__(self, content: bytes, headers: dict | None = None) -> None: - self._buffer = io.BytesIO(content) - self.headers = headers or {} - - def read(self, size: int = -1) -> bytes: - return self._buffer.read(size) - - def __enter__(self): - return self - - def __exit__(self, *args): - self._buffer.close() - - -def _data_release_manifest_response( - *, - model_version: str = "1.653.3", - data_version: str = "1.83.4", - dataset_sha256: str = "e" * 64, - extra_artifacts: dict | None = None, - compatible_model_packages: list[dict] | None = None, - headers: dict | None = None, -): - artifacts = { - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": data_version, - "sha256": dataset_sha256, - } - } - if extra_artifacts: - artifacts.update(extra_artifacts) - payload = { - "schema_version": 1, - "data_package": { - "name": "policyengine-us-data", - "version": data_version, - }, - "build": { - "build_id": f"policyengine-us-data-{data_version}", - "built_with_model_package": { - "name": "policyengine-us", - "version": model_version, - "git_sha": "deadbeef", - "data_build_fingerprint": "sha256:fingerprint", - }, - }, - "compatible_model_packages": ( - compatible_model_packages - if compatible_model_packages is not None - else [{"name": "policyengine-us", "specifier": f"=={model_version}"}] - ), - "artifacts": artifacts, - } - return _FakeHFResponse( - json.dumps(payload).encode(), - headers=( - {"x-repo-commit": "release-manifest-commit-sha"} - if headers is None - else headers - ), - ) - - -_INCOME_TAX_TARGET = ( - 
"irs_soi.ty2022.historic_table_2.us.all.income_tax_liability_amount@2024" -) -_INCOME_TAX_RETURNS_TARGET = ( - "irs_soi.ty2022.historic_table_2.us.all.income_tax_liability_returns@2024" -) -_SOCIAL_SECURITY_TARGET = "ssa_supplement.cy2024.oasdi_ssi_payments.social_security_benefits.payment_amount@2024" - - -def _json_bytes(payload: dict) -> bytes: - return json.dumps(payload).encode() - - -def _json_sha256(payload: dict) -> str: - return hashlib.sha256(_json_bytes(payload)).hexdigest() - - -def _json_response(payload: dict, headers: dict | None = None) -> _FakeHFResponse: - return _FakeHFResponse(_json_bytes(payload), headers=headers) - - -def _populace_calibration_diagnostics( - *, - income_tax_relative_error: float = -0.02, - income_tax_returns_relative_error: float = -0.07, - social_security_relative_error: float = 0.04, -) -> dict: - def target(name: str, relative_error: float) -> dict: - return { - "name": name, - "target": 100.0, - "final_estimate": 100.0 * (1.0 + relative_error), - "relative_error": relative_error, - } - - return { - "targets": [ - target(_INCOME_TAX_TARGET, income_tax_relative_error), - target(_INCOME_TAX_RETURNS_TARGET, income_tax_returns_relative_error), - target(_SOCIAL_SECURITY_TARGET, social_security_relative_error), - ] - } - - -@pytest.fixture -def sandbox(tmp_path: Path) -> dict: - """A writable scratch copy of the US release manifest + a stub - pyproject.toml, returned as ``{manifest_dir, pyproject_path, - manifest_sha256}``. 
- """ - manifest_dir = tmp_path / "manifests" - manifest_dir.mkdir() - manifest = { - "schema_version": 1, - "bundle_id": "us-4.0.0", - "country_id": "us", - "policyengine_version": "4.0.0", - "model_package": { - "name": "policyengine-us", - "version": "1.600.0", - "sha256": "c" * 64, - "wheel_url": "https://files.pythonhosted.org/packages/old.whl", - }, - "data_package": { - "name": "policyengine-us-data", - "version": "1.70.0", - "repo_id": "policyengine/policyengine-us-data", - "release_manifest_path": "releases/1.70.0/release_manifest.json", - "release_manifest_revision": "old-release-manifest-commit", - }, - "certified_data_artifact": { - "data_package": { - "name": "policyengine-us-data", - "version": "1.70.0", - }, - "build_id": "policyengine-us-data-1.70.0", - "dataset": "enhanced_cps_2024", - "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@old-dataset-commit", - "sha256": "d" * 64, - }, - "certification": { - "compatibility_basis": "matching_data_build_fingerprint", - "data_build_id": "policyengine-us-data-1.70.0", - "built_with_model_version": "1.595.0", - "certified_for_model_version": "1.600.0", - "certified_by": "test fixture", - "data_build_fingerprint": "sha256:fingerprint", - }, - "default_dataset": "enhanced_cps_2024", - "datasets": {"enhanced_cps_2024": {"path": "enhanced_cps_2024.h5"}}, - "region_datasets": {"national": {"path_template": "enhanced_cps_2024.h5"}}, - } - (manifest_dir / "us.json").write_text(json.dumps(manifest, indent=2)) - - pyproject_path = tmp_path / "pyproject.toml" - pyproject_path.write_text( - "[project]\n" - 'version = "4.2.0"\n' - "\n" - "[project.optional-dependencies]\n" - "us = [\n" - ' "policyengine_core>=3.25.0",\n' - ' "policyengine-us==1.600.0",\n' - "]\n" - ) - return { - "manifest_dir": manifest_dir, - "pyproject_path": pyproject_path, - } - - -def test__bump_model_only_rewrites_wheel_pins_and_pyproject(sandbox) -> None: - """Bumping only the model version pulls fresh wheel metadata, - keeps 
the dataset pin intact, and updates pyproject.toml. - """ - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "pypi.org" in url: - return _pypi_response("policyengine-us", "1.653.3") - if url.endswith("releases/1.70.0/release_manifest.json"): - return _data_release_manifest_response( - data_version="1.70.0", - dataset_sha256="d" * 64, - compatible_model_packages=[ - {"name": "policyengine-us", "specifier": "==1.600.0"}, - {"name": "policyengine-us", "specifier": "==1.653.3"}, - ], - extra_artifacts={ - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "old-dataset-commit", - "sha256": "d" * 64, - } - }, - headers={"x-repo-commit": "old-release-manifest-commit"}, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - result = refresh_release_bundle( - country="us", - model_version="1.653.3", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - assert result.new_model == "1.653.3" - assert result.new_data == "1.70.0" # untouched - assert result.pyproject_updated - assert "policyengine-us==1.653.3" in sandbox["pyproject_path"].read_text() - - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert written["model_package"]["version"] == "1.653.3" - assert written["model_package"]["sha256"] == "a" * 64 - assert written["bundle_id"] == "us-4.2.0" - assert written["policyengine_version"] == "4.2.0" - # Dataset pins untouched. 
- assert written["data_package"]["version"] == "1.70.0" - assert ( - written["data_package"]["release_manifest_path"] - == "releases/1.70.0/release_manifest.json" - ) - assert ( - written["data_package"]["release_manifest_revision"] - == "old-release-manifest-commit" - ) - assert written["certified_data_artifact"]["sha256"] == "d" * 64 - assert ( - written["certified_data_artifact"]["uri"] - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@old-dataset-commit" - ) - - -def test__bump_model_only_fetches_dataset_repo_release_manifest(sandbox) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["data_package"]["repo_id"] = "policyengine/populace-us" - manifest["data_package"]["repo_type"] = "dataset" - manifest["data_package"]["release_manifest_path"] = ( - "releases/1.70.0/release_manifest.json" - ) - manifest["certified_data_artifact"]["uri"] = ( - "hf://policyengine/populace-us/enhanced_cps_2024.h5@old-dataset-commit" - ) - manifest_path.write_text(json.dumps(manifest, indent=2)) - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "pypi.org" in url: - return _pypi_response("policyengine-us", "1.653.3") - if ( - url == "https://huggingface.co/datasets/policyengine/populace-us/resolve/" - "old-release-manifest-commit/releases/1.70.0/release_manifest.json" - ): - return _data_release_manifest_response( - data_version="1.70.0", - dataset_sha256="d" * 64, - compatible_model_packages=[ - {"name": "policyengine-us", "specifier": "==1.600.0"}, - {"name": "policyengine-us", "specifier": "==1.653.3"}, - ], - extra_artifacts={ - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "old-dataset-commit", - "sha256": "d" * 64, - } - }, - headers={"x-repo-commit": "old-release-manifest-commit"}, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with 
patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - result = refresh_release_bundle( - country="us", - model_version="1.653.3", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - assert result.new_model == "1.653.3" - - -def test__bump_model_only_requires_data_release_manifest_compatibility( - sandbox, -) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "pypi.org" in url: - return _pypi_response("policyengine-us", "1.653.3") - if url.endswith("releases/1.70.0/release_manifest.json"): - return _data_release_manifest_response( - data_version="1.70.0", - model_version="1.600.0", - compatible_model_packages=[ - {"name": "policyengine-us", "specifier": "==1.600.0"}, - ], - headers={"x-repo-commit": "old-release-manifest-commit"}, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="Data release manifest is not certified", - ): - refresh_release_bundle( - country="us", - model_version="1.653.3", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__bump_data_only_streams_hf_and_updates_uri(sandbox) -> None: - """Bumping only the data version streams the HF file, recomputes - its sha256, and rewrites the URI revision.""" - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["data_package"].pop("release_manifest_path") - manifest["data_package"].pop("release_manifest_revision") - manifest_path.write_text(json.dumps(manifest, indent=2)) - - hf_bytes = b"synthetic dataset payload" - expected_sha256 = hashlib.sha256(hf_bytes).hexdigest() - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "huggingface.co" in url: - assert "@" not in url # URI revision is in the URL path - assert "/datasets/" not in url - 
assert "1.83.4" in url - return _FakeHFResponse(hf_bytes) - raise AssertionError(f"Unexpected URL: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - result = refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - assert result.new_model == "1.600.0" # untouched - assert result.new_data == "1.83.4" - assert result.new_dataset_sha256 == expected_sha256 - assert not result.pyproject_updated # no model bump => no pyproject change - - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert written["data_package"]["version"] == "1.83.4" - assert "release_manifest_path" not in written["data_package"] - assert "release_manifest_revision" not in written["data_package"] - assert written["certified_data_artifact"]["data_package"]["version"] == "1.83.4" - assert written["certified_data_artifact"]["build_id"] == ( - "policyengine-us-data-1.83.4" - ) - assert written["certified_data_artifact"]["sha256"] == expected_sha256 - assert ( - written["certified_data_artifact"]["uri"] - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@1.83.4" - ) - - -def test__bump_data_only_writes_release_manifest_revision_when_absent( - sandbox, -) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["data_package"].pop("release_manifest_revision") - manifest_path.write_text(json.dumps(manifest, indent=2)) - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if url.endswith("releases/1.83.4/release_manifest.json"): - return _data_release_manifest_response() - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - 
pyproject_path=sandbox["pyproject_path"], - ) - - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert ( - written["data_package"]["release_manifest_revision"] - == "release-manifest-commit-sha" - ) - assert ( - written["certified_data_artifact"]["uri"] - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@release-manifest-commit-sha" - ) - - -def test__bump_data_only_falls_back_to_main_for_release_manifest( - sandbox, -) -> None: - seen_urls = [] - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - seen_urls.append(url) - if "/resolve/1.83.4/releases/1.83.4/release_manifest.json" in url: - raise HTTPError(url, 404, "Not Found", hdrs=None, fp=None) - if "/resolve/main/releases/1.83.4/release_manifest.json" in url: - return _data_release_manifest_response() - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - assert any("/resolve/1.83.4/" in url for url in seen_urls) - assert any("/resolve/main/" in url for url in seen_urls) - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert ( - written["data_package"]["release_manifest_revision"] - == "release-manifest-commit-sha" - ) - assert ( - written["certified_data_artifact"]["uri"] - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@release-manifest-commit-sha" - ) - - -def test__release_manifest_version_mismatch_raises(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if url.endswith("releases/1.83.4/release_manifest.json"): - return _data_release_manifest_response(data_version="1.83.3") - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with 
pytest.raises( - ValueError, - match="declares version '1.83.3', expected '1.83.4'", - ): - refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__bump_both_updates_everything(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "pypi.org" in url: - return _pypi_response("policyengine-us", "1.653.3") - if url.endswith("releases/1.83.4/release_manifest.json"): - return _data_release_manifest_response() - raise AssertionError(url) - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - result = refresh_release_bundle( - country="us", - model_version="1.653.3", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - assert result.pyproject_updated - assert result.new_model == "1.653.3" - assert result.new_data == "1.83.4" - - -def test__bump_both_uses_data_release_manifest_metadata(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "pypi.org" in url: - return _pypi_response("policyengine-us", "1.653.3") - if url.endswith("releases/1.83.4/release_manifest.json"): - return _data_release_manifest_response() - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - result = refresh_release_bundle( - country="us", - model_version="1.653.3", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - assert result.new_dataset_sha256 == "e" * 64 - - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert written["certified_data_artifact"]["build_id"] == ( - "policyengine-us-data-1.83.4" - ) - assert written["certified_data_artifact"]["sha256"] == "e" * 64 - assert written["certification"]["compatibility_basis"] == ( - 
"exact_build_model_version" - ) - assert written["certification"]["built_with_model_version"] == "1.653.3" - assert written["certification"]["built_with_model_git_sha"] == "deadbeef" - assert written["certification"]["data_build_fingerprint"] == ("sha256:fingerprint") - assert ( - written["data_package"]["release_manifest_revision"] - == "release-manifest-commit-sha" - ) - - -def test__model_refresh_uses_compatible_model_package_assertion(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "pypi.org" in url: - return _pypi_response("policyengine-us", "1.653.3") - if url.endswith("releases/1.70.0/release_manifest.json"): - return _data_release_manifest_response( - model_version="1.600.0", - data_version="1.70.0", - compatible_model_packages=[ - {"name": "policyengine-us", "specifier": "==1.600.0"}, - {"name": "policyengine-us", "specifier": "==1.653.3"}, - ], - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - refresh_release_bundle( - country="us", - model_version="1.653.3", - release_manifest_revision="release-manifest-commit-sha", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert ( - written["certification"]["compatibility_basis"] - == "legacy_compatible_model_package" - ) - assert written["certification"]["built_with_model_version"] == "1.600.0" - assert written["certification"]["certified_for_model_version"] == "1.653.3" - - -def test__model_refresh_rejects_uncertified_data_release_manifest(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "pypi.org" in url: - return _pypi_response("policyengine-us", "1.653.3") - if url.endswith("releases/1.70.0/release_manifest.json"): - return _data_release_manifest_response( - model_version="1.600.0", - data_version="1.70.0", 
- compatible_model_packages=[ - {"name": "policyengine-us", "specifier": "==1.600.0"}, - ], - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises(ValueError, match="not certified"): - refresh_release_bundle( - country="us", - model_version="1.653.3", - release_manifest_revision="release-manifest-commit-sha", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__custom_release_manifest_refreshes_long_term_dataset_hashes( - sandbox, -) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["data_package"]["release_manifest_path"] = ( - "releases/crfb-longrun-old/release_manifest.json" - ) - manifest["data_package"]["release_manifest_revision"] = "crfb-longrun-old" - manifest["datasets"]["long_term_cps_2100"] = { - "path": "long_term/2100.h5", - "sha256": "1" * 64, - "metadata_sha256": "2" * 64, - } - manifest_path.write_text(json.dumps(manifest, indent=2)) - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if ( - "/resolve/crfb-longrun-new/releases/crfb-longrun-new/release_manifest.json" - in url - ): - return _data_release_manifest_response( - data_version="1.83.4", - extra_artifacts={ - "long_term/2100": { - "kind": "microdata", - "path": "long_term/2100.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.83.4", - "sha256": "3" * 64, - }, - "long_term/2100.h5.metadata": { - "kind": "auxiliary", - "path": "long_term/2100.h5.metadata.json", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.83.4", - "sha256": "4" * 64, - }, - }, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - refresh_release_bundle( - country="us", - data_version="1.83.4", - 
release_manifest_path="releases/crfb-longrun-new/release_manifest.json", - release_manifest_revision="crfb-longrun-new", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert ( - written["data_package"]["release_manifest_path"] - == "releases/crfb-longrun-new/release_manifest.json" - ) - assert ( - written["data_package"]["release_manifest_revision"] - == "release-manifest-commit-sha" - ) - assert written["datasets"]["long_term_cps_2100"]["sha256"] == "3" * 64 - assert written["datasets"]["long_term_cps_2100"]["metadata_sha256"] == "4" * 64 - assert ( - written["certified_data_artifact"]["uri"] - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@release-manifest-commit-sha" - ) - - -def test__custom_release_manifest_requires_existing_long_term_metadata_sidecar( - sandbox, -) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["data_package"]["release_manifest_path"] = ( - "releases/crfb-longrun-old/release_manifest.json" - ) - manifest["data_package"]["release_manifest_revision"] = "crfb-longrun-old" - manifest["datasets"]["long_term_cps_2100"] = { - "path": "long_term/2100.h5", - "sha256": "1" * 64, - "metadata_sha256": "2" * 64, - } - manifest_path.write_text(json.dumps(manifest, indent=2)) - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if ( - "/resolve/crfb-longrun-new/releases/crfb-longrun-new/release_manifest.json" - in url - ): - return _data_release_manifest_response( - data_version="1.83.4", - extra_artifacts={ - "long_term/2100": { - "kind": "microdata", - "path": "long_term/2100.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.83.4", - "sha256": "3" * 64, - }, - }, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", 
side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="missing metadata sidecar artifact", - ): - refresh_release_bundle( - country="us", - data_version="1.83.4", - release_manifest_path="releases/crfb-longrun-new/release_manifest.json", - release_manifest_revision="crfb-longrun-new", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__custom_release_manifest_requires_existing_long_term_dataset_artifact( - sandbox, -) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["data_package"]["release_manifest_path"] = ( - "releases/crfb-longrun-old/release_manifest.json" - ) - manifest["data_package"]["release_manifest_revision"] = "crfb-longrun-old" - manifest["datasets"]["long_term_cps_2100"] = { - "path": "long_term/2100.h5", - "sha256": "1" * 64, - "metadata_sha256": "2" * 64, - } - manifest_path.write_text(json.dumps(manifest, indent=2)) - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if ( - "/resolve/crfb-longrun-new/releases/crfb-longrun-new/release_manifest.json" - in url - ): - return _data_release_manifest_response(data_version="1.83.4") - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="missing dataset artifact", - ): - refresh_release_bundle( - country="us", - data_version="1.83.4", - release_manifest_path="releases/crfb-longrun-new/release_manifest.json", - release_manifest_revision="crfb-longrun-new", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__refresh_preserves_dataset_entries_with_explicit_revisions( - sandbox, -) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["datasets"]["long_term_cps_2100"] = { - "path": 
"long_term/2100.h5", - "revision": "crfb-longrun-20260517", - "sha256": "1" * 64, - "metadata_sha256": "2" * 64, - } - manifest_path.write_text(json.dumps(manifest, indent=2)) - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if url.endswith("releases/1.83.4/release_manifest.json"): - return _data_release_manifest_response(data_version="1.83.4") - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert written["datasets"]["long_term_cps_2100"] == { - "path": "long_term/2100.h5", - "revision": "crfb-longrun-20260517", - "sha256": "1" * 64, - "metadata_sha256": "2" * 64, - } - assert ( - written["certified_data_artifact"]["uri"] - == "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5@release-manifest-commit-sha" - ) - - -def _write_populace_refresh_manifest(sandbox, *, old_release: str) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["data_package"] = { - "name": "populace-data", - "version": "0.1.0", - "repo_id": "policyengine/populace-us", - "repo_type": "dataset", - "release_manifest_path": f"releases/{old_release}/release_manifest.json", - "release_manifest_revision": old_release, - } - manifest["certified_data_artifact"] = { - "data_package": {"name": "populace-data", "version": "0.1.0"}, - "build_id": old_release, - "dataset": "populace_us_2024", - "uri": f"hf://policyengine/populace-us/populace_us_2024.h5@{old_release}", - "sha256": "1" * 64, - } - manifest["certification"] = { - "compatibility_basis": "exact_build_model_version", - "data_build_id": old_release, - "built_with_model_version": "1.600.0", - 
"certified_for_model_version": "1.600.0", - "certified_by": "test fixture", - } - manifest["default_dataset"] = "populace_us_2024" - manifest["datasets"] = { - "populace_us_2024": { - "path": "populace_us_2024.h5", - "repo_id": "policyengine/populace-us", - "revision": old_release, - "sha256": "1" * 64, - }, - "populace_us_2024_calibration": { - "path": "populace_us_2024_calibration.npz", - "repo_id": "policyengine/populace-us", - "revision": old_release, - "sha256": "2" * 64, - }, - "calibration_diagnostics": { - "path": f"releases/{old_release}/calibration_diagnostics.json", - "repo_id": "policyengine/populace-us", - "revision": old_release, - "sha256": "3" * 64, - }, - } - manifest_path.write_text(json.dumps(manifest, indent=2)) - - -def _populace_release_payload( - *, - new_release: str, - diagnostics: dict, - include_diagnostics_artifact: bool = True, -) -> dict: - artifacts = { - "populace_us_2024": { - "kind": "microdata", - "path": "populace_us_2024.h5", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": "5" * 64, - }, - "populace_us_2024_calibration": { - "kind": "calibration", - "path": "populace_us_2024_calibration.npz", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": "6" * 64, - }, - } - if include_diagnostics_artifact: - artifacts["calibration_diagnostics"] = { - "kind": "diagnostics", - "path": "calibration_diagnostics.json", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": _json_sha256(diagnostics), - } - return { - "schema_version": 1, - "data_package": {"name": "populace-data", "version": "0.1.0"}, - "build": { - "build_id": new_release, - "built_with_model_package": { - "name": "policyengine-us", - "version": "1.600.0", - }, - }, - "artifacts": artifacts, - } - - -def test__same_data_version_release_override_refreshes_revisioned_populace_artifacts( - sandbox, -) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = 
json.loads(manifest_path.read_text()) - old_release = "populace-us-2024-old" - new_release = "populace-us-2024-new" - manifest["data_package"] = { - "name": "populace-data", - "version": "0.1.0", - "repo_id": "policyengine/populace-us", - "repo_type": "dataset", - "release_manifest_path": f"releases/{old_release}/release_manifest.json", - "release_manifest_revision": old_release, - } - manifest["certified_data_artifact"] = { - "data_package": {"name": "populace-data", "version": "0.1.0"}, - "build_id": old_release, - "dataset": "populace_us_2024", - "uri": f"hf://policyengine/populace-us/populace_us_2024.h5@{old_release}", - "sha256": "1" * 64, - } - manifest["certification"] = { - "compatibility_basis": "exact_build_model_version", - "data_build_id": old_release, - "built_with_model_version": "1.600.0", - "certified_for_model_version": "1.600.0", - "certified_by": "test fixture", - } - manifest["default_dataset"] = "populace_us_2024" - manifest["datasets"] = { - "populace_us_2024": { - "path": "populace_us_2024.h5", - "repo_id": "policyengine/populace-us", - "revision": old_release, - "sha256": "1" * 64, - }, - "populace_us_2024_calibration": { - "path": "populace_us_2024_calibration.npz", - "repo_id": "policyengine/populace-us", - "revision": old_release, - "sha256": "2" * 64, - }, - "calibration_diagnostics": { - "path": f"releases/{old_release}/calibration_diagnostics.json", - "repo_id": "policyengine/populace-us", - "revision": old_release, - "sha256": "3" * 64, - }, - "external_long_term": { - "path": "long_term/2100.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "crfb-longrun-20260517", - "sha256": "4" * 64, - }, - } - manifest_path.write_text(json.dumps(manifest, indent=2)) - - diagnostics = _populace_calibration_diagnostics() - payload = { - "schema_version": 1, - "data_package": {"name": "populace-data", "version": "0.1.0"}, - "build": { - "build_id": new_release, - "built_with_model_package": { - "name": "policyengine-us", - 
"version": "1.600.0", - }, - }, - "artifacts": { - "populace_us_2024": { - "kind": "microdata", - "path": "populace_us_2024.h5", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": "5" * 64, - }, - "populace_us_2024_calibration": { - "kind": "calibration", - "path": "populace_us_2024_calibration.npz", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": "6" * 64, - }, - "calibration_diagnostics": { - "kind": "diagnostics", - "path": "calibration_diagnostics.json", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": _json_sha256(diagnostics), - }, - "reform_validation": { - "kind": "diagnostics", - "path": "reform_validation.json", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": "8" * 64, - }, - }, - } - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if ( - f"/resolve/{new_release}/releases/{new_release}/release_manifest.json" - in url - ): - return _json_response( - payload, - headers={"x-repo-commit": "new-release-manifest-commit"}, - ) - if ( - f"/resolve/{new_release}/releases/{new_release}/calibration_diagnostics.json" - in url - ): - return _json_response(diagnostics) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - refresh_release_bundle( - country="us", - data_version="0.1.0", - release_manifest_path=f"releases/{new_release}/release_manifest.json", - release_manifest_revision=new_release, - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) - assert written["certified_data_artifact"]["build_id"] == new_release - assert ( - written["certified_data_artifact"]["uri"] - == "hf://policyengine/populace-us/populace_us_2024.h5@new-release-manifest-commit" - ) - assert written["datasets"]["populace_us_2024"] == { - "path": 
"populace_us_2024.h5", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": "5" * 64, - } - assert written["datasets"]["calibration_diagnostics"] == { - "path": f"releases/{new_release}/calibration_diagnostics.json", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": _json_sha256(diagnostics), - } - assert written["datasets"]["reform_validation"] == { - "path": f"releases/{new_release}/reform_validation.json", - "repo_id": "policyengine/populace-us", - "revision": new_release, - "sha256": "8" * 64, - } - assert written["datasets"]["external_long_term"] == { - "path": "long_term/2100.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "crfb-longrun-20260517", - "sha256": "4" * 64, - } - - -def test__populace_release_requires_calibration_diagnostics( - sandbox, -) -> None: - old_release = "populace-us-2024-old" - new_release = "populace-us-2024-new" - _write_populace_refresh_manifest(sandbox, old_release=old_release) - diagnostics = _populace_calibration_diagnostics() - payload = _populace_release_payload( - new_release=new_release, - diagnostics=diagnostics, - include_diagnostics_artifact=False, - ) - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if ( - f"/resolve/{new_release}/releases/{new_release}/release_manifest.json" - in url - ): - return _json_response( - payload, - headers={"x-repo-commit": "new-release-manifest-commit"}, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="missing calibration_diagnostics", - ): - refresh_release_bundle( - country="us", - data_version="0.1.0", - release_manifest_path=f"releases/{new_release}/release_manifest.json", - release_manifest_revision=new_release, - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def 
test__populace_release_rejects_critical_calibration_target_miss( - sandbox, -) -> None: - old_release = "populace-us-2024-old" - new_release = "populace-us-2024-new" - _write_populace_refresh_manifest(sandbox, old_release=old_release) - diagnostics = _populace_calibration_diagnostics( - income_tax_relative_error=-0.6508063496056629 - ) - payload = _populace_release_payload( - new_release=new_release, - diagnostics=diagnostics, - ) - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if ( - f"/resolve/{new_release}/releases/{new_release}/release_manifest.json" - in url - ): - return _json_response( - payload, - headers={"x-repo-commit": "new-release-manifest-commit"}, - ) - if ( - f"/resolve/{new_release}/releases/{new_release}/calibration_diagnostics.json" - in url - ): - return _json_response(diagnostics) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="critical calibration target gate failed", - ): - refresh_release_bundle( - country="us", - data_version="0.1.0", - release_manifest_path=f"releases/{new_release}/release_manifest.json", - release_manifest_revision=new_release, - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__custom_release_manifest_requires_existing_long_term_dataset_sha( - sandbox, -) -> None: - manifest_path = sandbox["manifest_dir"] / "us.json" - manifest = json.loads(manifest_path.read_text()) - manifest["data_package"]["release_manifest_path"] = ( - "releases/crfb-longrun-old/release_manifest.json" - ) - manifest["data_package"]["release_manifest_revision"] = "crfb-longrun-old" - manifest["datasets"]["long_term_cps_2100"] = { - "path": "long_term/2100.h5", - "sha256": "1" * 64, - "metadata_sha256": "2" * 64, - } - manifest_path.write_text(json.dumps(manifest, indent=2)) - - def fake_urlopen(request, *args, **kwargs): - url = 
request.full_url - if ( - "/resolve/crfb-longrun-new/releases/crfb-longrun-new/release_manifest.json" - in url - ): - return _data_release_manifest_response( - data_version="1.83.4", - extra_artifacts={ - "long_term/2100": { - "kind": "microdata", - "path": "long_term/2100.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.83.4", - }, - "long_term/2100.h5.metadata": { - "kind": "auxiliary", - "path": "long_term/2100.h5.metadata.json", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.83.4", - "sha256": "4" * 64, - }, - }, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="dataset artifact lacks sha256", - ): - refresh_release_bundle( - country="us", - data_version="1.83.4", - release_manifest_path="releases/crfb-longrun-new/release_manifest.json", - release_manifest_revision="crfb-longrun-new", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__explicit_release_manifest_revision_does_not_fallback_to_main( - sandbox, -) -> None: - seen_urls = [] - - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - seen_urls.append(url) - if ( - "/resolve/bad-crfb-ref/releases/crfb-longrun-new/release_manifest.json" - in url - ): - raise HTTPError(url, 404, "Not Found", hdrs=None, fp=None) - if "/resolve/main/releases/crfb-longrun-new/release_manifest.json" in url: - return _data_release_manifest_response(data_version="1.83.4") - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="Could not fetch data release manifest", - ): - refresh_release_bundle( - country="us", - data_version="1.83.4", - release_manifest_path="releases/crfb-longrun-new/release_manifest.json", - release_manifest_revision="bad-crfb-ref", 
- manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - assert any("/resolve/bad-crfb-ref/" in url for url in seen_urls) - assert not any("/resolve/main/" in url for url in seen_urls) - - -def test__missing_release_manifest_metadata_raises(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if url.endswith("releases/1.83.4/release_manifest.json"): - return _FakeHFResponse(b"not json") - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="Could not fetch data release manifest", - ): - refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__missing_release_manifest_commit_raises(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if url.endswith("releases/1.83.4/release_manifest.json"): - return _data_release_manifest_response(headers={}) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="Could not resolve an immutable HF commit", - ): - refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__release_manifest_missing_certified_artifact_raises(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if url.endswith("releases/1.83.4/release_manifest.json"): - payload = { - "schema_version": 1, - "data_package": { - "name": "policyengine-us-data", - "version": "1.83.4", - }, - "artifacts": { - "other_dataset": { - "kind": "microdata", - "path": "other_dataset.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "1.83.4", - 
"sha256": "e" * 64, - } - }, - } - return _FakeHFResponse( - json.dumps(payload).encode(), - headers={"x-repo-commit": "release-manifest-commit-sha"}, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises( - ValueError, - match="does not include certified dataset", - ): - refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__update_pyproject_false_leaves_pins_alone(sandbox) -> None: - def fake_urlopen(request, *args, **kwargs): - url = request.full_url - if "pypi.org" in url: - return _pypi_response("policyengine-us", "1.653.3") - if url.endswith("releases/1.70.0/release_manifest.json"): - return _data_release_manifest_response( - data_version="1.70.0", - dataset_sha256="d" * 64, - compatible_model_packages=[ - {"name": "policyengine-us", "specifier": "==1.600.0"}, - {"name": "policyengine-us", "specifier": "==1.653.3"}, - ], - extra_artifacts={ - "enhanced_cps_2024": { - "kind": "microdata", - "path": "enhanced_cps_2024.h5", - "repo_id": "policyengine/policyengine-us-data", - "revision": "old-dataset-commit", - "sha256": "d" * 64, - } - }, - headers={"x-repo-commit": "old-release-manifest-commit"}, - ) - raise AssertionError(f"Unexpected URL fetched: {url}") - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - result = refresh_release_bundle( - country="us", - model_version="1.653.3", - update_pyproject=False, - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - assert not result.pyproject_updated - assert "policyengine-us==1.600.0" in sandbox["pyproject_path"].read_text() - - -def test__invalid_pyproject_version_fails_before_manifest_write( - sandbox, tmp_path -) -> None: - invalid_pyproject = tmp_path / "invalid-pyproject.toml" - 
invalid_pyproject.write_text('[project]\nname = "policyengine"\n') - manifest_path = sandbox["manifest_dir"] / "us.json" - original = manifest_path.read_text() - - with pytest.raises(ValueError, match="Could not find project version"): - refresh_release_bundle( - country="us", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=invalid_pyproject, - ) - - assert manifest_path.read_text() == original - - -def test__no_matching_wheel_on_pypi_raises(sandbox) -> None: - def fake_urlopen(*args, **kwargs): - return io.BytesIO(json.dumps({"urls": []}).encode()) - - with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): - with pytest.raises(ValueError, match="No py3-none-any wheel"): - refresh_release_bundle( - country="us", - model_version="1.999.0", - manifest_dir=sandbox["manifest_dir"], - pyproject_path=sandbox["pyproject_path"], - ) - - -def test__malformed_uri_raises(tmp_path) -> None: - """If the current manifest's URI doesn't match the expected - ``hf://.../path@revision`` shape, we refuse to guess.""" - manifest_dir = tmp_path / "m" - manifest_dir.mkdir() - bad = { - "schema_version": 1, - "bundle_id": "us-4.0.0", - "country_id": "us", - "policyengine_version": "4.0.0", - "model_package": { - "name": "policyengine-us", - "version": "1.600.0", - "sha256": "c" * 64, - "wheel_url": "https://…old.whl", - }, - "data_package": { - "name": "policyengine-us-data", - "version": "1.70.0", - "repo_id": "policyengine/policyengine-us-data", - }, - "certified_data_artifact": { - "data_package": { - "name": "policyengine-us-data", - "version": "1.70.0", - }, - "build_id": "policyengine-us-data-1.70.0", - "dataset": "enhanced_cps_2024", - # Malformed: no @revision. 
- "uri": "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5", - "sha256": "d" * 64, - }, - "certification": { - "compatibility_basis": "matching_data_build_fingerprint", - "data_build_id": "policyengine-us-data-1.70.0", - "built_with_model_version": "1.595.0", - "certified_for_model_version": "1.600.0", - "certified_by": "test fixture", - }, - "default_dataset": "enhanced_cps_2024", - "datasets": {"enhanced_cps_2024": {"path": "enhanced_cps_2024.h5"}}, - "region_datasets": {}, - } - (manifest_dir / "us.json").write_text(json.dumps(bad)) - - with pytest.raises(ValueError, match="Cannot parse current dataset URI"): - refresh_release_bundle( - country="us", - data_version="1.83.4", - manifest_dir=manifest_dir, - pyproject_path=tmp_path / "pyproject.toml", - ) diff --git a/tests/test_certify_data_release.py b/tests/test_certify_data_release.py index 69d01602..5ca207f9 100644 --- a/tests/test_certify_data_release.py +++ b/tests/test_certify_data_release.py @@ -18,6 +18,11 @@ MANIFEST_URI = ( f"hf://dataset/policyengine/populace-us@{TAG}/releases/{TAG}/release_manifest.json" ) +UK_TAG = "populace-uk-2023-bbbbbbb-20260101" +UK_MANIFEST_URI = ( + "hf://dataset/policyengine/populace-uk-private" + f"@{UK_TAG}/releases/{UK_TAG}/release_manifest.json" +) def _release_manifest_payload() -> dict: @@ -95,6 +100,40 @@ def _manifest() -> DataReleaseManifest: return DataReleaseManifest.model_validate(_release_manifest_payload()) +def _uk_release_manifest_payload() -> dict: + payload = _release_manifest_payload() + payload["compatible_model_packages"] = [ + {"name": "policyengine-uk", "specifier": "==2.89.2"} + ] + payload["build"]["build_id"] = UK_TAG + payload["build"]["built_with_model_package"] = { + "name": "policyengine-uk", + "version": "2.89.2", + "git_sha": "deadbeef", + } + for artifact in payload["artifacts"].values(): + if artifact["repo_id"] == "policyengine/populace-us": + artifact["repo_id"] = "policyengine/populace-uk-private" + artifact["revision"] = UK_TAG 
+ payload["artifacts"].pop("us_source_coverage") + return payload + + +def _bundle_source_payload() -> dict: + return { + "schema_version": 2, + "bundle_version": "9.9.9", + "policyengine_version": "9.9.9", + "packages": { + "policyengine": {"name": "policyengine", "version": "9.9.9"}, + "policyengine-uk": {"name": "policyengine-uk", "version": "2.0.0"}, + }, + "extras": {}, + "countries": {"uk": {"model_package": "policyengine-uk"}}, + "data_releases": {}, + } + + class TestParseManifestUri: def test__given_dataset_uri__then_parses_parts(self): parts = parse_manifest_uri(MANIFEST_URI) @@ -221,7 +260,7 @@ def test__given_build_provenance__then_certification_carries_it(self): certification = payload["certification"] assert certification["compatibility_basis"] == "built_with_model_package" - assert certification["certified_by"] == "policyengine.py certification" + assert certification["certified_by"] == "policyengine.py bundle certification" assert certification["data_build_id"] == TAG assert certification["built_with_model_version"] == "1.723.0" assert certification["built_with_model_git_sha"] == "deadbeef" @@ -232,10 +271,14 @@ def test__given_build_provenance__then_certification_carries_it(self): class TestCertifyDataRelease: - def test__given_fetched_manifest__then_writes_country_manifest(self, tmp_path): + def test__given_fetched_populace_manifest__then_updates_bundle_manifest( + self, tmp_path + ): + bundle_path = tmp_path / "manifest.json" + bundle_path.write_text(json.dumps(_bundle_source_payload()) + "\n") response = MagicMock() response.status_code = 200 - response.content = json.dumps(_release_manifest_payload()).encode() + response.content = json.dumps(_uk_release_manifest_payload()).encode() with ( patch( @@ -258,20 +301,46 @@ def test__given_fetched_manifest__then_writes_country_manifest(self, tmp_path): "policyengine.provenance.certification.fetch_pypi_wheel_metadata", return_value={"sha256": "d" * 64, "url": "https://example"}, ), + patch( + 
"policyengine.provenance.certification.policyengine_version", + return_value="9.9.9", + ), ): result = certify_data_release( + country="uk", + data_producer="populace", + manifest_uri=UK_MANIFEST_URI, + model_version="2.89.2", + bundle_path=bundle_path, + ) + + written = json.loads(bundle_path.read_text()) + release = written["data_releases"]["uk"] + assert release["data_producer"] == "populace" + assert release["default_dataset"] == "populace_us_2024" + assert release["certification"]["data_build_id"] == UK_TAG + assert release["version"] == UK_TAG + assert release["source_manifest_uri"] == UK_MANIFEST_URI + assert written["packages"]["policyengine-uk"]["version"] == "2.89.2" + assert result.data_producer == "populace" + assert result.dataset_count == 4 + assert result.build_id == UK_TAG + assert result.bundle_path == bundle_path + + def test__given_us_without_data_producer__then_legacy_update_is_explicitly_unsupported( + self, tmp_path + ): + bundle_path = tmp_path / "manifest.json" + bundle_path.write_text(json.dumps(_bundle_source_payload()) + "\n") + + with pytest.raises(CertificationError, match="Legacy data-producer"): + certify_data_release( country="us", manifest_uri=MANIFEST_URI, model_version="1.723.0", - output_dir=tmp_path, + bundle_path=bundle_path, ) - written = json.loads((tmp_path / "us.json").read_text()) - assert written["default_dataset"] == "populace_us_2024" - assert written["certification"]["data_build_id"] == TAG - assert result.dataset_count == 5 - assert result.build_id == TAG - def test__given_missing_populace_us_source_coverage__then_raises(self, tmp_path): response = MagicMock() response.status_code = 200 @@ -290,9 +359,9 @@ def test__given_missing_populace_us_source_coverage__then_raises(self, tmp_path) ): certify_data_release( country="us", + data_producer="populace", manifest_uri=MANIFEST_URI, model_version="1.723.0", - output_dir=tmp_path, ) def test__given_unreachable_artifact__then_raises(self, tmp_path): @@ -321,9 +390,9 @@ def 
test__given_unreachable_artifact__then_raises(self, tmp_path): ): certify_data_release( country="us", + data_producer="populace", manifest_uri=MANIFEST_URI, model_version="1.723.0", - output_dir=tmp_path, ) def test__given_unreachable_vendored_artifact__then_raises(self, tmp_path): @@ -356,80 +425,28 @@ def test__given_unreachable_vendored_artifact__then_raises(self, tmp_path): ): certify_data_release( country="us", + data_producer="populace", manifest_uri=MANIFEST_URI, model_version="1.723.0", - output_dir=tmp_path, ) class TestVendoredSidecarBinding: - def test__given_vendored_us_manifest__then_tro_sidecar_binds_it(self): - """The shipped TRO must bind the shipped country manifest under the - same byte-hash convention used by trace-tro-verify.""" + def test__given_vendored_bundle_manifest__then_tro_sidecar_binds_it(self): + """The shipped TRO must bind the certified country payload embedded in + the bundle manifest.""" import hashlib from importlib.resources import files - manifest_dir = files("policyengine").joinpath("data/release_manifests") + bundle_dir = files("policyengine").joinpath("data", "bundle") expected = hashlib.sha256( - manifest_dir.joinpath("us.json").read_bytes() + bundle_dir.joinpath("manifest.json").read_bytes() ).hexdigest() - tro = json.loads(manifest_dir.joinpath("us.trace.tro.jsonld").read_text()) + tro = json.loads(bundle_dir.joinpath("us.trace.tro.jsonld").read_text()) artifacts = tro["@graph"][0]["trov:hasComposition"]["trov:hasArtifact"] bundle_manifest = next( a for a in artifacts if a["@id"].endswith("bundle_manifest") ) assert bundle_manifest["trov:sha256"] == expected - - -class TestTraceTroRegeneration: - def test__given_cached_manifest_readers__then_clears_before_regeneration( - self, monkeypatch, tmp_path - ): - """Certification writes the manifest then regenerates the sidecar in - the same process, so stale manifest caches must be invalidated first.""" - from policyengine.provenance import bundle, trace - from 
policyengine.provenance import manifest as manifest_module - - calls = [] - - class FakeCachedReader: - def __init__(self, name, result): - self.name = name - self.result = result - - def cache_clear(self): - calls.append(f"{self.name}.cache_clear") - - def __call__(self, country): - calls.append(f"{self.name}({country})") - return self.result - - monkeypatch.setattr( - manifest_module, - "get_release_manifest", - FakeCachedReader("release", object()), - ) - monkeypatch.setattr( - manifest_module, - "get_data_release_manifest", - FakeCachedReader("data_release", object()), - ) - monkeypatch.setattr( - trace, - "build_trace_tro_from_release_bundle", - lambda release, data_release: {"ok": True}, - ) - monkeypatch.setattr(trace, "serialize_trace_tro", lambda tro: b"{}\n") - - out_path = bundle.regenerate_trace_tro("us", tmp_path) - - assert out_path == tmp_path / "us.trace.tro.jsonld" - assert out_path.read_bytes() == b"{}\n" - assert calls == [ - "release.cache_clear", - "data_release.cache_clear", - "release(us)", - "data_release(us)", - ] diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index a66f3767..33536fb4 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -1,13 +1,15 @@ """Tests for bundled compatibility manifests and data release manifests.""" import hashlib +import importlib import json import os import re import subprocess import sys +from importlib.machinery import ModuleSpec from pathlib import Path -from types import ModuleType +from types import ModuleType, SimpleNamespace from unittest.mock import MagicMock, patch from requests import Timeout @@ -27,13 +29,6 @@ resolve_local_managed_dataset_source, resolve_managed_dataset_reference, ) -from policyengine.tax_benefit_models.uk import ( - managed_microsimulation as managed_uk_microsimulation, -) -from policyengine.tax_benefit_models.us import ( - managed_microsimulation as managed_us_microsimulation, -) -from 
policyengine.tax_benefit_models.us import us_latest PYPROJECT = Path(__file__).resolve().parents[1] / "pyproject.toml" POLICYENGINE_VERSION = re.search( @@ -48,7 +43,7 @@ US_DATA_RELEASE_REVISION = US_DATA_RELEASE_ID US_DATA_RELEASE_PATH = f"releases/{US_DATA_RELEASE_ID}/release_manifest.json" US_DATA_ARTIFACT_REVISION = US_DATA_RELEASE_ID -US_CERTIFICATION_SOURCE = "policyengine.py certification" +US_CERTIFICATION_SOURCE = "policyengine.py bundle certification" US_MANAGED_DATASET_URI = ( f"hf://policyengine/populace-us/populace_us_2024.h5@{US_DATA_ARTIFACT_REVISION}" ) @@ -64,7 +59,7 @@ UK_DATA_RELEASE_ID = "populace-uk-2023-dd68c73-4aa4b14-20260619T023711Z" UK_DATA_RELEASE_REVISION = UK_DATA_RELEASE_ID UK_DATA_RELEASE_PATH = f"releases/{UK_DATA_RELEASE_ID}/release_manifest.json" -UK_CERTIFICATION_SOURCE = "policyengine.py certification" +UK_CERTIFICATION_SOURCE = "policyengine.py bundle certification" UK_CERTIFIED_DATASET_URI = ( f"hf://policyengine/populace-uk-private/populace_uk_2023.h5" f"@{UK_DATA_RELEASE_REVISION}" @@ -89,14 +84,44 @@ def _response_with_json(payload: dict) -> MagicMock: return response -def _country_module_with_microsimulation( +def _country_modules_with_microsimulation( name: str, microsimulation: MagicMock, -) -> ModuleType: +) -> dict[str, ModuleType]: module = ModuleType(name) module.Microsimulation = microsimulation module.__file__ = str(Path(__file__).resolve()) - return module + module.__path__ = [] + module.__spec__ = ModuleSpec(name, loader=None, is_package=True) + module.__spec__.submodule_search_locations = [] + + parameters = MagicMock() + parameters.get_descendants.return_value = [] + system_module = ModuleType(f"{name}.system") + system_module.system = SimpleNamespace(variables={}, parameters=parameters) + system_module.__spec__ = ModuleSpec(f"{name}.system", loader=None) + + data_module = ModuleType(f"{name}.data") + data_module.__path__ = [] + data_module.__spec__ = ModuleSpec(f"{name}.data", loader=None, is_package=True) + 
data_module.__spec__.submodule_search_locations = [] + + class FakeDataset: + @staticmethod + def validate_file_path(_path, _raise_if_invalid=True): + return False + + schema_module = ModuleType(f"{name}.data.dataset_schema") + schema_module.UKMultiYearDataset = FakeDataset + schema_module.UKSingleYearDataset = FakeDataset + schema_module.__spec__ = ModuleSpec(f"{name}.data.dataset_schema", loader=None) + + return { + name: module, + f"{name}.system": system_module, + f"{name}.data": data_module, + f"{name}.data.dataset_schema": schema_module, + } class TestReleaseManifests: @@ -569,7 +594,7 @@ def test__given_legacy_compatible_certification__then_offline_fingerprint_mismat assert certification == bundled_certification - def test__given_manifest_fetch_failure_and_version_mismatch__then_fallback_fails( + def test__given_manifest_fetch_failure_and_version_mismatch__then_fallback_is_unverified( self, ): get_data_release_manifest.cache_clear() @@ -578,15 +603,17 @@ def test__given_manifest_fetch_failure_and_version_mismatch__then_fallback_fails "policyengine.provenance.manifest.requests.get", side_effect=Timeout("network timeout"), ): - try: - certify_data_release_compatibility( - "us", - runtime_model_version="1.602.0", - ) - except DataReleaseManifestUnavailableError as error: - assert "Could not fetch" in str(error) - else: - raise AssertionError("Expected offline mismatched version to fail") + certification = certify_data_release_compatibility( + "us", + runtime_model_version="1.602.0", + ) + + assert ( + certification.compatibility_basis + == "unverified_data_release_manifest_unavailable" + ) + assert certification.certified_for_model_version == "1.602.0" + assert certification.certified_by == US_CERTIFICATION_SOURCE def test__given_offline_hf__then_us_import_uses_bundled_certification( self, @@ -728,19 +755,25 @@ def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bu with ( patch.dict( sys.modules, - { - "policyengine_us": 
_country_module_with_microsimulation( - "policyengine_us", - mock_microsimulation, - ) - }, + _country_modules_with_microsimulation( + "policyengine_us", + mock_microsimulation, + ), ), patch( - "policyengine.tax_benefit_models.us.model.materialize_dataset_source", - return_value="/tmp/enhanced_cps_2024.h5", + "policyengine.tax_benefit_models.common.model_version.certify_data_release_compatibility", + return_value=get_release_manifest("us").certification, ), ): - microsim = managed_us_microsimulation() + us_model = importlib.import_module( + "policyengine.tax_benefit_models.us.model" + ) + with patch.object( + us_model, + "materialize_dataset_source", + return_value="/tmp/enhanced_cps_2024.h5", + ): + microsim = us_model.managed_microsimulation() dataset = mock_microsimulation.call_args.kwargs["dataset"] assert dataset == microsim.policyengine_bundle["runtime_dataset_source"] @@ -750,7 +783,7 @@ def test__given_us_managed_microsimulation__then_passes_certified_dataset_and_bu assert microsim.policyengine_bundle["runtime_dataset"] == "populace_us_2024" assert ( microsim.policyengine_bundle["runtime_dataset_uri"] - == us_latest.default_dataset_uri + == us_model.us_latest.default_dataset_uri ) dataset_source = microsim.policyengine_bundle["runtime_dataset_source"] assert dataset_source == "/tmp/enhanced_cps_2024.h5" @@ -762,22 +795,28 @@ def test__given_us_unmanaged_dataset_uri__then_source_is_not_rewritten(self): with ( patch.dict( sys.modules, - { - "policyengine_us": _country_module_with_microsimulation( - "policyengine_us", - mock_microsimulation, - ) - }, + _country_modules_with_microsimulation( + "policyengine_us", + mock_microsimulation, + ), ), patch( - "policyengine.tax_benefit_models.us.model.materialize_dataset_source", - return_value="/tmp/cps_2023.h5", + "policyengine.tax_benefit_models.common.model_version.certify_data_release_compatibility", + return_value=get_release_manifest("us").certification, ), ): - microsim = managed_us_microsimulation( - 
dataset=dataset, - allow_unmanaged=True, + us_model = importlib.import_module( + "policyengine.tax_benefit_models.us.model" ) + with patch.object( + us_model, + "materialize_dataset_source", + return_value="/tmp/cps_2023.h5", + ): + microsim = us_model.managed_microsimulation( + dataset=dataset, + allow_unmanaged=True, + ) assert mock_microsimulation.call_args.kwargs["dataset"] == "/tmp/cps_2023.h5" assert microsim.policyengine_bundle["runtime_dataset_uri"] == dataset @@ -790,19 +829,25 @@ def test__given_uk_managed_dataset_name__then_resolves_within_bundle(self): with ( patch.dict( sys.modules, - { - "policyengine_uk": _country_module_with_microsimulation( - "policyengine_uk", - mock_microsimulation, - ) - }, + _country_modules_with_microsimulation( + "policyengine_uk", + mock_microsimulation, + ), ), patch( - "policyengine.tax_benefit_models.uk.model.materialize_dataset_source", - return_value="/tmp/populace_uk_2023.h5", + "policyengine.tax_benefit_models.common.model_version.certify_data_release_compatibility", + return_value=get_release_manifest("uk").certification, ), ): - microsim = managed_uk_microsimulation(dataset="populace_uk_2023") + uk_model = importlib.import_module( + "policyengine.tax_benefit_models.uk.model" + ) + with patch.object( + uk_model, + "materialize_dataset_source", + return_value="/tmp/populace_uk_2023.h5", + ): + microsim = uk_model.managed_microsimulation(dataset="populace_uk_2023") dataset = mock_microsimulation.call_args.kwargs["dataset"] assert dataset == "/tmp/populace_uk_2023.h5" @@ -823,22 +868,28 @@ def test__given_uk_unmanaged_dataset_uri__then_source_is_not_rewritten(self): with ( patch.dict( sys.modules, - { - "policyengine_uk": _country_module_with_microsimulation( - "policyengine_uk", - mock_microsimulation, - ) - }, + _country_modules_with_microsimulation( + "policyengine_uk", + mock_microsimulation, + ), ), patch( - "policyengine.tax_benefit_models.uk.model.materialize_dataset_source", - return_value="/tmp/frs_2022_23.h5", 
+ "policyengine.tax_benefit_models.common.model_version.certify_data_release_compatibility", + return_value=get_release_manifest("uk").certification, ), ): - microsim = managed_uk_microsimulation( - dataset=dataset, - allow_unmanaged=True, + uk_model = importlib.import_module( + "policyengine.tax_benefit_models.uk.model" ) + with patch.object( + uk_model, + "materialize_dataset_source", + return_value="/tmp/frs_2022_23.h5", + ): + microsim = uk_model.managed_microsimulation( + dataset=dataset, + allow_unmanaged=True, + ) assert mock_microsimulation.call_args.kwargs["dataset"] == ( "/tmp/frs_2022_23.h5" diff --git a/tests/test_trace_tro.py b/tests/test_trace_tro.py index aa706db4..13ec377a 100644 --- a/tests/test_trace_tro.py +++ b/tests/test_trace_tro.py @@ -280,7 +280,7 @@ def test__given_artifact_locations__then_all_paths_are_https_or_local( "trov:hasArtifactLocation" ] paths = [location["trov:hasLocation"] for location in locations] - assert paths[0].startswith("data/release_manifests/") + assert paths[0] == "data/bundle/manifest.json" for path in paths[1:]: assert path.startswith("https://"), path @@ -471,7 +471,7 @@ def test__given_fixed_ci_env__then_tro_bytes_match_across_builds(self, monkeypat def test__given_self_url__then_tro_records_it(self): self_url = ( "https://raw.githubusercontent.com/PolicyEngine/policyengine.py/" - "v3.4.5/src/policyengine/data/release_manifests/us.trace.tro.jsonld" + "v3.4.5/src/policyengine/data/bundle/us.trace.tro.jsonld" ) tro = build_trace_tro_from_release_bundle( get_release_manifest("us"), @@ -763,7 +763,7 @@ def test__given_bundle_tro_url__then_performance_records_it(self, us_bundle_tro) tro = build_results_trace_tro( self._results(), bundle_tro=us_bundle_tro, - bundle_tro_url="https://raw.githubusercontent.com/PolicyEngine/policyengine.py/v3.4.5/src/policyengine/data/release_manifests/us.trace.tro.jsonld", + 
bundle_tro_url="https://raw.githubusercontent.com/PolicyEngine/policyengine.py/v3.4.5/src/policyengine/data/bundle/us.trace.tro.jsonld", ) performance = tro["@graph"][0]["trov:hasPerformance"] @@ -803,7 +803,7 @@ def test__given_write_helper__then_results_and_tro_files_are_sidebyside( ): bundle_url = ( "https://raw.githubusercontent.com/PolicyEngine/policyengine.py/" - "v3.4.5/src/policyengine/data/release_manifests/us.trace.tro.jsonld" + "v3.4.5/src/policyengine/data/bundle/us.trace.tro.jsonld" ) written = write_results_with_trace_tro( self._results(),