"""v2 comparison and recommendation — pure numpy, no v1 dependency.
Provides ``compare_variants()`` and ``recommendation_summary()``
for 2-arm compare-to-control experiments. All functions are pure (no I/O,
no PyMC imports).
"""
from __future__ import annotations
import numpy as np
from pytyche.contracts import (
ComparisonResult,
Decision,
DecisionThresholds,
DecompositionSamples,
RecommendationSummary,
)
# ---------------------------------------------------------------------------
# Private comparison math
# ---------------------------------------------------------------------------
_EPS = 1e-10
def _expected_loss(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
) -> tuple[float, float, np.ndarray, np.ndarray]:
    """Compute the expected loss of picking each arm over the other.

    The per-draw loss of a choice is the amount by which the other arm
    beats it, floored at zero; the expected loss is the mean of those draws.

    Args:
        samples_a: Draws for arm ``a``.
        samples_b: Draws for arm ``b``, combined elementwise with
            ``samples_a``.

    Returns:
        ``(loss_a, loss_b, loss_samples_a, loss_samples_b)`` where
        ``loss_a = E[max(b - a, 0)]`` is the cost of choosing ``a`` when
        ``b`` is better, ``loss_b = E[max(a - b, 0)]`` is the cost of
        choosing ``b`` when ``a`` is better, and the last two entries are
        the per-draw losses behind those means.
    """
    shortfall_of_a = np.maximum(samples_b - samples_a, 0.0)
    shortfall_of_b = np.maximum(samples_a - samples_b, 0.0)
    mean_shortfall_a = float(np.mean(shortfall_of_a))
    mean_shortfall_b = float(np.mean(shortfall_of_b))
    return mean_shortfall_a, mean_shortfall_b, shortfall_of_a, shortfall_of_b
def _probability_positive(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
) -> float:
    """Estimate P(a > b) as the share of draws where ``a`` strictly exceeds ``b``.

    Args:
        samples_a: Draws for arm ``a``.
        samples_b: Draws for arm ``b``, compared elementwise with
            ``samples_a``.

    Returns:
        Mean of the elementwise ``samples_a > samples_b`` indicator.
    """
    a_wins = samples_a > samples_b
    return float(np.mean(a_wins))
def _relative_effect(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
) -> np.ndarray:
    """Per-draw relative effect ``(a - b) / |b|`` with a guarded denominator.

    A relative effect is undefined when the baseline draw is (near) zero,
    so the denominator ``|b|`` is floored at ``_EPS`` before dividing, and
    any result that is still not finite is replaced with ``0.0``.

    Args:
        samples_a: Draws for the arm being evaluated.
        samples_b: Draws for the baseline arm the effect is relative to.

    Returns:
        Array of relative effects containing only finite values.
    """
    safe_scale = np.maximum(np.abs(samples_b), _EPS)
    ratio = (samples_a - samples_b) / safe_scale
    is_usable = np.isfinite(ratio)
    return np.where(is_usable, ratio, 0.0)
def _probability_better(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
    threshold: float,
) -> float:
    """Estimate P(relative_effect(a, b) > threshold).

    Args:
        samples_a: Draws for the arm being evaluated.
        samples_b: Draws for the baseline arm.
        threshold: Relative effect that must be strictly exceeded.

    Returns:
        Share of draws whose relative effect, as computed by
        ``_relative_effect``, is strictly above ``threshold``.
    """
    effect = _relative_effect(samples_a, samples_b)
    clears_bar = effect > threshold
    return float(np.mean(clears_bar))
def _probability_harmful(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
    threshold: float,
) -> float:
    """Estimate P(relative_effect(a, b) < -threshold).

    Args:
        samples_a: Draws for the arm being evaluated.
        samples_b: Draws for the baseline arm.
        threshold: Size of the relative drop that counts as harmful; the
            comparison is made against its negation.

    Returns:
        Share of draws whose relative effect, as computed by
        ``_relative_effect``, is strictly below ``-threshold``.
    """
    effect = _relative_effect(samples_a, samples_b)
    below_floor = effect < -threshold
    return float(np.mean(below_floor))
def _lift_ci(
    lift_samples: np.ndarray,
    ci_level: float,
) -> tuple[float, float]:
    """Return the ``(low, high)`` credible interval of the lift draws.

    The interval leaves the same probability mass, ``(1 - ci_level) / 2``,
    outside each end.

    Args:
        lift_samples: Draws of the lift.
        ci_level: Probability mass the interval should cover, as a
            fraction (e.g. ``0.80``).

    Returns:
        Lower and upper percentile bounds as plain floats.
    """
    tail_mass = (1.0 - ci_level) / 2.0
    lower_pct = 100 * tail_mass
    upper_pct = 100 * (1.0 - tail_mass)
    lower_bound = float(np.percentile(lift_samples, lower_pct))
    upper_bound = float(np.percentile(lift_samples, upper_pct))
    return (lower_bound, upper_bound)
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
# (stray "[docs]" link marker from the rendered documentation removed)
def compare_variants(
    samples_baseline: np.ndarray,
    samples_comparison: np.ndarray,
    baseline_name: str,
    comparison_name: str,
    lift_unit: str,
    ci_level: float = 0.80,
    min_practical_effect: float = 0.02,
    decomposition: DecompositionSamples | None = None,
) -> ComparisonResult:
    """Compare two variants from posterior samples.

    Args:
        samples_baseline: 1-D posterior samples for the baseline variant.
            Any array-like accepted by ``np.asarray`` is allowed.
        samples_comparison: 1-D posterior samples for the comparison
            variant, with the same length as ``samples_baseline``.
        baseline_name: Variant name serving as baseline.
        comparison_name: Variant name being compared.
        lift_unit: ``"pct"`` for relative lift, ``"dollar"`` for absolute.
            Stored on the result; it does not change how lift is computed.
        ci_level: Credible interval level (default 0.80).
        min_practical_effect: Minimum meaningful relative effect size for
            ``probability_better`` / ``probability_harmful`` (default 0.02).
        decomposition: Optional frequency/severity decomposition, passed
            through to the result unchanged.

    Returns:
        ``ComparisonResult`` with all comparison metrics.

    Raises:
        ValueError: If ``lift_unit`` is not recognised, ``ci_level`` is not
            strictly between 0 and 1, ``min_practical_effect`` is negative
            or NaN, or the sample arrays are not 1-D, differ in length, or
            are empty.
    """
    valid_lift_units = ("pct", "dollar")
    if lift_unit not in valid_lift_units:
        raise ValueError(
            f"lift_unit must be one of {valid_lift_units}, got {lift_unit!r}"
        )
    if not (0.0 < ci_level < 1.0):
        raise ValueError(f"ci_level must be in (0, 1), got {ci_level}")
    # Written as ``not (x >= 0)`` so that NaN is rejected as well.
    if not (min_practical_effect >= 0.0):
        raise ValueError(
            f"min_practical_effect must be >= 0, got {min_practical_effect}"
        )

    # Accept lists/tuples too; ndarrays pass through without being copied.
    samples_baseline = np.asarray(samples_baseline)
    samples_comparison = np.asarray(samples_comparison)
    if samples_baseline.ndim != 1 or samples_comparison.ndim != 1:
        raise ValueError("Sample arrays must be 1-D")
    if len(samples_baseline) != len(samples_comparison):
        raise ValueError(
            f"Sample arrays must have equal length, "
            f"got {len(samples_baseline)} and {len(samples_comparison)}"
        )
    # Every metric below is a mean or percentile over the draws; with zero
    # draws those are undefined, so fail loudly instead of returning NaN.
    if samples_baseline.size == 0:
        raise ValueError("Sample arrays must not be empty")

    # Lift samples are always absolute (T - C) in metric-native units.
    # lift_unit is retained so display layers know how to present the metric
    # (e.g. derive percentage lift for binary metrics when rendering).
    lift_samples = samples_comparison - samples_baseline

    p_positive = _probability_positive(samples_comparison, samples_baseline)
    p_better = _probability_better(
        samples_comparison, samples_baseline, min_practical_effect
    )
    p_harmful = _probability_harmful(
        samples_comparison, samples_baseline, min_practical_effect
    )
    (
        loss_baseline,
        loss_comparison,
        loss_samples_baseline,
        loss_samples_comparison,
    ) = _expected_loss(samples_baseline, samples_comparison)
    ci = _lift_ci(lift_samples, ci_level)

    return ComparisonResult(
        baseline=baseline_name,
        comparison=comparison_name,
        method="compare_to_control",
        probability_positive=p_positive,
        probability_better=p_better,
        probability_harmful=p_harmful,
        expected_loss_baseline=loss_baseline,
        expected_loss_comparison=loss_comparison,
        expected_loss_samples_baseline=loss_samples_baseline,
        expected_loss_samples_comparison=loss_samples_comparison,
        lift_samples=lift_samples,
        lift_unit=lift_unit,
        lift_ci=ci,
        lift_ci_level=ci_level,
        decomposition=decomposition,
    )
# (stray "[docs]" link marker from the rendered documentation removed)
def recommendation_summary(
    comparison: ComparisonResult,
    thresholds: DecisionThresholds | None = None,
) -> RecommendationSummary:
    """Turn a comparison result into a ship / stop / continue recommendation.

    Rules are checked in priority order and the first match wins:
        1. SHIP: the comparison's expected loss is below the tolerance AND
           ``probability_positive`` and ``probability_better`` both exceed
           their thresholds.
        2. STOP (harm): ``probability_harmful`` exceeds the harm threshold.
        3. STOP (futility): ``probability_better`` is below the futility
           threshold.
        4. CONTINUE: none of the above.

    Args:
        comparison: Result from ``compare_variants()``.
        thresholds: Decision thresholds. Defaults to ``DecisionThresholds()``.

    Returns:
        ``RecommendationSummary`` carrying the decision, the metrics it was
        based on, and the thresholds that were applied.
    """
    limits = DecisionThresholds() if thresholds is None else thresholds

    # Snapshot of the applied thresholds, echoed back on the summary.
    applied_limits = {
        key: getattr(limits, key)
        for key in (
            "expected_loss_tolerance",
            "p_positive_threshold",
            "p_better_threshold",
            "futility_threshold",
            "harm_threshold",
        )
    }

    loss_if_shipped = comparison.expected_loss_comparison
    p_positive = comparison.probability_positive
    p_better = comparison.probability_better
    p_harmful = comparison.probability_harmful

    ship_ok = (
        loss_if_shipped < limits.expected_loss_tolerance
        and p_positive > limits.p_positive_threshold
        and p_better > limits.p_better_threshold
    )
    if ship_ok:
        verdict = Decision.SHIP
    elif p_harmful > limits.harm_threshold or p_better < limits.futility_threshold:
        # Harm is checked before futility; both lead to the same decision.
        verdict = Decision.STOP
    else:
        verdict = Decision.CONTINUE

    return RecommendationSummary(
        treatment=comparison.comparison,
        decision=verdict,
        expected_loss_baseline=comparison.expected_loss_baseline,
        expected_loss_comparison=loss_if_shipped,
        probability_positive=p_positive,
        probability_better=p_better,
        probability_harmful=p_harmful,
        thresholds=applied_limits,
    )