"""Run manifest for feature-analysis reproducibility.
Captures config, dataset identity (zarr paths + mtimes hash), sklearn /
numpy versions, git SHA (best-effort), and seeds. Written next to the
report as ``manifest.json``.
"""
import hashlib
import json
import platform
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
def _zarr_dir_hash(zarr_dir: Path) -> dict[str, Any]:
    """Fingerprint the ``*.zarr`` entries directly inside *zarr_dir*.

    The digest covers each entry's file name followed by its modification
    time in nanoseconds, visited in sorted path order. Store contents are
    never read, so this is a cheap change detector for reproducibility
    tracking, not a cryptographic content hash.

    Returns a dict with:
        ``zarr_dir``  -- the resolved directory path, as a string
        ``n_stores``  -- how many ``*.zarr`` entries were found
        ``sha256``    -- hex digest over the (name, mtime) sequence
    """
    # Stat everything up front so a failing stat() surfaces before hashing.
    fingerprints = [
        (store.name, store.stat().st_mtime_ns)
        for store in sorted(zarr_dir.glob("*.zarr"))
    ]

    # Name and mtime are fed back-to-back with no separator; keep it that
    # way so digests stay comparable with previously written manifests.
    digest = hashlib.sha256()
    for store_name, mtime_ns in fingerprints:
        digest.update(f"{store_name}{mtime_ns}".encode())

    summary: dict[str, Any] = {}
    summary["zarr_dir"] = str(zarr_dir.resolve())
    summary["n_stores"] = len(fingerprints)
    summary["sha256"] = digest.hexdigest()
    return summary
def _git_sha(repo_root: Path) -> str | None:
try:
out = subprocess.check_output(
["git", "-C", str(repo_root), "rev-parse", "HEAD"],
stderr=subprocess.DEVNULL,
timeout=5,
)
return out.decode().strip()
except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
return None
def _lib_versions() -> dict[str, str]:
    """Collect the Python version plus versions of importable key libraries.

    The result always contains ``"python"``. Each of numpy, sklearn, scipy,
    zarr, pandas and matplotlib is added only if it can be imported; a
    module lacking ``__version__`` is recorded as ``"unknown"``.
    """
    probed = ("numpy", "sklearn", "scipy", "zarr", "pandas", "matplotlib")
    found: dict[str, str] = {"python": platform.python_version()}
    for library in probed:
        try:
            module = __import__(library)
        except ImportError:
            # Not installed: leave it out of the manifest entirely.
            pass
        else:
            found[library] = getattr(module, "__version__", "unknown")
    return found
[docs]
def build_manifest(
    *,
    config: dict[str, Any],
    zarr_dir: str | Path,
    feature_names: list[str],
    target_name: str,
    n_rows: int,
    n_cases: int,
    seeds: dict[str, int],
    repo_root: str | Path | None = None,
) -> dict[str, Any]:
    """Build the manifest dictionary for the current run.

    All arguments are keyword-only.

    Args:
        config: Run configuration; stored in the manifest as-is (not copied).
        zarr_dir: Directory holding the ``*.zarr`` stores to fingerprint.
        feature_names: Feature names; stored as a new list.
        target_name: Name of the target.
        n_rows: Row count, coerced with ``int()``.
        n_cases: Case count, coerced with ``int()``.
        seeds: Seed values; stored as a new dict.
        repo_root: Directory passed to ``git -C`` for the SHA lookup. When
            ``None``, ``parents[2]`` of this module's resolved path is used.

    Returns:
        A dict with the keys ``created_utc`` (timezone-aware ISO-8601 UTC
        timestamp), ``config``, ``dataset``, ``seeds``, ``versions``,
        ``git_sha`` and ``platform``.
    """
    store_root = Path(zarr_dir)
    git_root = (
        Path(__file__).resolve().parents[2]
        if repo_root is None
        else Path(repo_root)
    )

    manifest: dict[str, Any] = {}
    manifest["created_utc"] = datetime.now(timezone.utc).isoformat()
    manifest["config"] = config

    # Start from the directory fingerprint, then append the dataset
    # descriptors (later keys win on a name clash).
    dataset_info = dict(_zarr_dir_hash(store_root))
    dataset_info.update(
        feature_names=list(feature_names),
        target_name=target_name,
        n_rows=int(n_rows),
        n_cases=int(n_cases),
    )
    manifest["dataset"] = dataset_info

    manifest["seeds"] = dict(seeds)
    manifest["versions"] = _lib_versions()
    manifest["git_sha"] = _git_sha(git_root)
    manifest["platform"] = platform.platform()
    return manifest
[docs]
def write_manifest(manifest: dict[str, Any], output_dir: str | Path) -> Path:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
path = output_dir / "manifest.json"
path.write_text(json.dumps(manifest, indent=2, sort_keys=True))
return path