"""Visual confidence payload for generator verification.
Provides a tested payload function that backs all panels of the visual
confidence notebook. The notebook renders from the payload; unit tests
assert on the payload directly without notebook rendering.
Public API
----------
- ``VisualConfidencePayload`` — frozen dataclass with required sections.
- ``build_visual_confidence_payload(bundle, bootstrap_seed,
n_bootstrap=200) -> VisualConfidencePayload``.
"""
from __future__ import annotations
import dataclasses
import numpy as np
from pytyche.contracts import CalibrationBundle, MetricFamily
from pytyche.summarize import summarize_hurdle_components
# ---------------------------------------------------------------------------
# Truth-leakage column names — observed DataFrames must not contain these.
# ---------------------------------------------------------------------------
#: Column names that indicate ground-truth leakage into observed data.
#: These names appear in TruthResult / CalibrationTruth internals but must
#: never appear as columns in VariantData.visitors DataFrames.
_TRUTH_COLUMN_NAMES: frozenset[str] = frozenset({
"cate",
"tau",
"tau_i",
"p0",
"p1",
"m0",
"m1",
"conv",
"aov",
"cate_per_visitor",
"conv_per_visitor",
"aov_per_visitor",
"true_effect",
"true_cate",
"effect",
"effect_components",
})
# ---------------------------------------------------------------------------
# Payload type
# ---------------------------------------------------------------------------
@dataclasses.dataclass(frozen=True)
class VisualConfidencePayload:
    """Typed payload backing all panels of the visual confidence notebook.

    All four fields are required — there are no optional sections. Future
    analyzer/BCF panels extend this type by subclassing or by adding fields
    to a derived dataclass without breaking existing sections (open/closed
    principle for the payload contract).

    The dataclass is frozen, so fields cannot be rebound after construction;
    the dictionaries they reference are ordinary mutable dicts.

    Fields:
        invariants: Name → bool map of generator contract checks.
        truth_summary: Population-level truth statistics.
        data_summary: Per-variant empirical summaries (variant name → stats).
        recovery: Empirical recovery comparison with planted truth and
            bootstrap SE on the empirical lift.
    """

    invariants: dict[str, bool]
    truth_summary: dict[str, object]
    data_summary: dict[str, dict[str, object]]
    recovery: dict[str, object]
# ---------------------------------------------------------------------------
# Invariant checks
# ---------------------------------------------------------------------------
def _check_observed_truth_boundary(bundle: CalibrationBundle) -> bool:
    """Report whether the observed data is free of truth-leakage columns.

    Every variant's ``visitors`` column names are compared against
    ``_TRUTH_COLUMN_NAMES``; a single shared name fails the check.

    Returns:
        True when no variant exposes a truth-leakage column (including the
        case of no variants at all), False otherwise.
    """
    # A non-empty intersection is truthy, so ``any`` stops at the first
    # variant that leaks a truth column.
    return not any(
        _TRUTH_COLUMN_NAMES.intersection(arm.visitors.columns)
        for arm in bundle.observed.variants
    )
def _check_cate_alignment(bundle: CalibrationBundle) -> bool:
"""True if cate_per_visitor length equals total observed visitors."""
cate = bundle.truth.cate_per_visitor
# visual_confidence is K=2-only: the scalar cate_per_visitor is populated
# only for binary arms (at K>=3 the heterogeneity lives in the per-contrast
# list). Fail loudly rather than degrade if handed a multi-arm bundle.
assert cate is not None, "visual_confidence requires a K=2 bundle (cate_per_visitor populated)"
total_visitors = sum(v.n_visitors for v in bundle.observed.variants)
return len(cate.values) == total_visitors
def _check_decomposition_identity(bundle: CalibrationBundle, tol: float = 1e-10) -> bool:
    """Verify that the planted effect equals conv_effect + aov_effect.

    Binary bundles pass unconditionally: with a single component the identity
    holds trivially. For every other metric family the two components are read
    from ``bundle.truth.effect_components`` (a missing key counts as 0.0) and
    their sum must lie within ``tol`` of ``bundle.truth.effect``.
    """
    truth = bundle.truth
    if truth.metric_family == MetricFamily.BINARY:
        return True

    parts = truth.effect_components
    component_total = parts.get("conv_effect", 0.0) + parts.get("aov_effect", 0.0)
    gap = abs(truth.effect - component_total)
    return gap <= tol
def _check_effect_equals_mean_cate(bundle: CalibrationBundle, tol: float = 1e-10) -> bool:
"""True if bundle.truth.effect equals mean(cate_per_visitor.values)."""
cate = bundle.truth.cate_per_visitor
assert cate is not None, "visual_confidence requires a K=2 bundle (cate_per_visitor populated)"
mean_cate = float(np.mean(cate.values))
return abs(bundle.truth.effect - mean_cate) <= tol
def _check_cross_arm_visitor_id_uniqueness(bundle: CalibrationBundle) -> bool:
"""True if no visitor_id appears in more than one variant."""
seen: set[str] = set()
for variant in bundle.observed.variants:
ids = set(variant.visitors["visitor_id"].tolist())
if ids & seen:
return False
seen.update(ids)
return True
# ---------------------------------------------------------------------------
# Section builders
# ---------------------------------------------------------------------------
def _build_invariants(bundle: CalibrationBundle) -> dict[str, bool]:
    """Run the five generator invariant checks against ``bundle``.

    Returns:
        A dict mapping each invariant's name to the boolean outcome of its
        check, in the order the checks are run. An exception raised by any
        individual check propagates to the caller.
    """
    # Ordered table of (payload key, check function); the order fixes both
    # execution order and key order in the result.
    checks = (
        ("observed_truth_boundary", _check_observed_truth_boundary),
        ("cate_alignment", _check_cate_alignment),
        ("decomposition_identity", _check_decomposition_identity),
        ("effect_equals_mean_cate", _check_effect_equals_mean_cate),
        ("cross_arm_visitor_id_uniqueness", _check_cross_arm_visitor_id_uniqueness),
    )
    return {label: check(bundle) for label, check in checks}
def _build_truth_summary(bundle: CalibrationBundle) -> dict[str, object]:
"""Extract population-level truth statistics from bundle.truth."""
cate = bundle.truth.cate_per_visitor
assert cate is not None, "visual_confidence requires a K=2 bundle (cate_per_visitor populated)"
cate_values = cate.values
return {
"effect": bundle.truth.effect,
"effect_components": bundle.truth.effect_components,
"cate_mean": float(np.mean(cate_values)),
"cate_std": float(np.std(cate_values, ddof=1)),
"cate_min": float(np.min(cate_values)),
"cate_max": float(np.max(cate_values)),
}
def _build_data_summary(
    bundle: CalibrationBundle,
) -> dict[str, dict[str, object]]:
    """Summarize each observed variant empirically.

    Every variant contributes ``n_visitors`` and ``conversion_rate``; when the
    bundle's metric family is ``HURDLE_REAL`` a ``mean_revenue`` entry is added
    as well. Rates and means are column totals divided by the variant's
    ``n_visitors``, and are reported as 0.0 for a variant with no visitors.

    Returns:
        A dict keyed by variant name whose values are the per-variant stats.
    """

    def _per_visitor_mean(frame, column: str, count: int) -> float:
        # Guard the division; the column is not touched for an empty variant.
        if count > 0:
            return frame[column].sum().item() / count
        return 0.0

    include_revenue = bundle.truth.metric_family == MetricFamily.HURDLE_REAL
    summaries: dict[str, dict[str, object]] = {}
    for arm in bundle.observed.variants:
        frame = arm.visitors
        count = arm.n_visitors
        stats: dict[str, object] = {
            "n_visitors": count,
            "conversion_rate": _per_visitor_mean(frame, "converted", count),
        }
        if include_revenue:
            stats["mean_revenue"] = _per_visitor_mean(frame, "revenue", count)
        summaries[arm.name] = stats
    return summaries
def _get_metric_values(bundle: CalibrationBundle) -> tuple[np.ndarray, np.ndarray]:
    """Pull the primary metric's per-visitor values for both arms.

    The ``converted`` column is used for binary bundles; any other metric
    family reads the ``revenue`` column. Both columns are converted to float
    arrays.

    Returns:
        ``(ctrl_values, treat_values)`` taken from ``variants[0]`` and
        ``variants[1]`` respectively.
    """
    # Positional convention: index 0 is control, index 1 is treatment.
    arms = bundle.observed.variants
    control_arm = arms[0]
    treatment_arm = arms[1]

    is_binary = bundle.truth.metric_family == MetricFamily.BINARY
    metric_column = "converted" if is_binary else "revenue"

    control_values = control_arm.visitors[metric_column].to_numpy(dtype=float)
    treatment_values = treatment_arm.visitors[metric_column].to_numpy(dtype=float)
    return control_values, treatment_values
def _build_recovery(
    bundle: CalibrationBundle,
    bootstrap_seed: int,
    n_bootstrap: int,
) -> dict[str, object]:
    """Compare the planted effect with what the observed data recovers.

    Fields:
        planted_effect: ``bundle.truth.effect`` (ground truth).
        empirical_lift: treatment mean minus control mean for the primary
            metric.
        bootstrap_se: sample standard deviation (``ddof=1``) of the lift over
            ``n_bootstrap`` resamples, each arm resampled with replacement at
            its own size. Reproducible for a fixed ``bootstrap_seed``.

    For ``HURDLE_REAL`` bundles the result additionally carries
    ``planted_conv_effect`` and ``planted_aov_effect`` from the truth's effect
    components, followed by whatever ``summarize_hurdle_components`` returns
    for the observed data.
    """
    control, treatment = _get_metric_values(bundle)
    observed_lift = float(treatment.mean() - control.mean())

    generator = np.random.default_rng(bootstrap_seed)

    def _resampled_lift() -> float:
        # Draw control first, then treatment: the draw order determines the
        # random stream and therefore the reproducible SE value.
        control_draw = generator.choice(control, size=len(control), replace=True)
        treatment_draw = generator.choice(treatment, size=len(treatment), replace=True)
        return float(treatment_draw.mean() - control_draw.mean())

    resampled_lifts = [_resampled_lift() for _ in range(n_bootstrap)]
    standard_error = float(np.std(resampled_lifts, ddof=1))

    result: dict[str, object] = {
        "planted_effect": bundle.truth.effect,
        "empirical_lift": observed_lift,
        "bootstrap_se": standard_error,
    }
    if bundle.truth.metric_family != MetricFamily.HURDLE_REAL:
        return result

    observed_components = summarize_hurdle_components(bundle.observed)
    result["planted_conv_effect"] = bundle.truth.effect_components["conv_effect"]
    result["planted_aov_effect"] = bundle.truth.effect_components["aov_effect"]
    # Merged last, so observed component keys win on any name collision.
    result.update(observed_components)
    return result
# ---------------------------------------------------------------------------
# Public entrypoint
# ---------------------------------------------------------------------------
def build_visual_confidence_payload(
    bundle: CalibrationBundle,
    bootstrap_seed: int,
    n_bootstrap: int = 200,
) -> VisualConfidencePayload:
    """Build a VisualConfidencePayload from a CalibrationBundle.

    Computes all panel data from the bundle:

    - ``invariants``: 5 generator contract checks (all bool).
    - ``truth_summary``: planted effect, components, and per-visitor CATE stats.
    - ``data_summary``: per-variant empirical summaries (n_visitors, rates, revenue).
    - ``recovery``: planted effect, empirical lift, and bootstrap SE on lift.

    Parameters
    ----------
    bundle:
        CalibrationBundle from ``generate_v2_core()``.
    bootstrap_seed:
        Seed for the bootstrap RNG — controls SE reproducibility.
    n_bootstrap:
        Number of bootstrap resamples. Default 200.

    Returns
    -------
    VisualConfidencePayload
        Frozen payload with all four required sections populated.

    Raises
    ------
    ValueError
        If ``n_bootstrap`` is not an ``int`` or is smaller than 2.
    """
    # Validate before touching the bundle: a sample standard deviation
    # (ddof=1) needs at least two resamples to be defined.
    if not isinstance(n_bootstrap, int) or n_bootstrap < 2:
        raise ValueError(
            f"n_bootstrap must be an int >= 2, got {n_bootstrap!r}"
        )
    return VisualConfidencePayload(
        invariants=_build_invariants(bundle),
        truth_summary=_build_truth_summary(bundle),
        data_summary=_build_data_summary(bundle),
        recovery=_build_recovery(bundle, bootstrap_seed, n_bootstrap),
    )