# Source code for pytyche.compare.variants

"""v2 comparison and recommendation — pure numpy, no v1 dependency.

Provides ``compare_variants()`` and ``recommendation_summary()``
for 2-arm compare-to-control experiments. All functions are pure (no I/O,
no PyMC imports).
"""

from __future__ import annotations

import numpy as np

from pytyche.contracts import (
    ComparisonResult,
    Decision,
    DecisionThresholds,
    DecompositionSamples,
    RecommendationSummary,
)

# ---------------------------------------------------------------------------
# Private comparison math
# ---------------------------------------------------------------------------

# Floor applied to |baseline| in _relative_effect so the division never
# sees a zero denominator.
_EPS = 1e-10


def _expected_loss(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
) -> tuple[float, float, np.ndarray, np.ndarray]:
    """Expected loss for choosing a over b, and vice versa.

    Returns (loss_a, loss_b, loss_samples_a, loss_samples_b) where:
    - loss_a = E[max(b - a, 0)] — cost of choosing a when b is better.
    - loss_b = E[max(a - b, 0)] — cost of choosing b when a is better.
    """
    loss_samples_a = np.maximum(samples_b - samples_a, 0.0)
    loss_samples_b = np.maximum(samples_a - samples_b, 0.0)
    return (
        float(np.mean(loss_samples_a)),
        float(np.mean(loss_samples_b)),
        loss_samples_a,
        loss_samples_b,
    )


def _probability_positive(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
) -> float:
    """P(a > b)."""
    return float(np.mean(samples_a > samples_b))


def _relative_effect(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
) -> np.ndarray:
    """Per-draw relative difference ``(a - b) / |b|``.

    The denominator is floored at ``_EPS`` so a baseline at or near zero
    cannot cause a division by zero. Any result that is still not finite
    (NaN or infinite) is reported as 0.0.
    """
    baseline_magnitude = np.abs(samples_b)
    safe_denominator = np.maximum(baseline_magnitude, _EPS)
    ratio = (samples_a - samples_b) / safe_denominator
    is_usable = np.isfinite(ratio)
    return np.where(is_usable, ratio, 0.0)


def _probability_better(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
    threshold: float,
) -> float:
    """Fraction of draws whose relative effect of a over b exceeds ``threshold``."""
    beats_threshold = _relative_effect(samples_a, samples_b) > threshold
    return float(np.mean(beats_threshold))


def _probability_harmful(
    samples_a: np.ndarray,
    samples_b: np.ndarray,
    threshold: float,
) -> float:
    """Fraction of draws whose relative effect of a over b is below ``-threshold``."""
    below_negative_threshold = _relative_effect(samples_a, samples_b) < -threshold
    return float(np.mean(below_negative_threshold))


def _lift_ci(
    lift_samples: np.ndarray,
    ci_level: float,
) -> tuple[float, float]:
    """(low, high) credible interval from lift samples."""
    alpha = (1.0 - ci_level) / 2.0
    low = float(np.percentile(lift_samples, 100 * alpha))
    high = float(np.percentile(lift_samples, 100 * (1.0 - alpha)))
    return (low, high)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def compare_variants(
    samples_baseline: np.ndarray,
    samples_comparison: np.ndarray,
    baseline_name: str,
    comparison_name: str,
    lift_unit: str,
    ci_level: float = 0.80,
    min_practical_effect: float = 0.02,
    decomposition: DecompositionSamples | None = None,
) -> ComparisonResult:
    """Compare two variants from posterior samples.

    Args:
        samples_baseline: 1-D posterior samples for the baseline variant.
        samples_comparison: 1-D posterior samples for the comparison variant.
            Must have the same length as ``samples_baseline``.
        baseline_name: Variant name serving as baseline.
        comparison_name: Variant name being compared.
        lift_unit: ``"pct"`` for relative lift, ``"dollar"`` for absolute.
            This only labels the result; it does not change the computation
            (``lift_samples`` is always ``comparison - baseline``).
        ci_level: Credible interval level (default 0.80).
        min_practical_effect: Minimum meaningful effect size for
            ``probability_better`` / ``probability_harmful`` (default 0.02).
            Compared against the relative effect of comparison vs. baseline.
        decomposition: Optional frequency/severity decomposition, passed
            through to the result unchanged.

    Returns:
        Frozen ``ComparisonResult`` with all comparison metrics.

    Raises:
        ValueError: If ``lift_unit`` is not ``"pct"`` or ``"dollar"``, if
            ``ci_level`` is not strictly between 0 and 1, if
            ``min_practical_effect`` is negative, or if the sample arrays are
            not 1-D, differ in length, or are empty.
    """
    _VALID_LIFT_UNITS = ("pct", "dollar")
    if lift_unit not in _VALID_LIFT_UNITS:
        raise ValueError(
            f"lift_unit must be one of {_VALID_LIFT_UNITS}, got {lift_unit!r}"
        )
    if not (0.0 < ci_level < 1.0):
        raise ValueError(f"ci_level must be in (0, 1), got {ci_level}")
    if min_practical_effect < 0.0:
        raise ValueError(
            f"min_practical_effect must be >= 0, got {min_practical_effect}"
        )
    if samples_baseline.ndim != 1 or samples_comparison.ndim != 1:
        raise ValueError("Sample arrays must be 1-D")
    if len(samples_baseline) != len(samples_comparison):
        raise ValueError(
            f"Sample arrays must have equal length, "
            f"got {len(samples_baseline)} and {len(samples_comparison)}"
        )
    # Lengths are equal at this point, so checking one array covers both.
    # Without this guard, empty input would flow into the mean/percentile
    # computations below and surface as NaN metrics or an unrelated NumPy
    # error instead of the documented ValueError.
    if samples_baseline.size == 0:
        raise ValueError("Sample arrays must not be empty")

    # Lift samples are always absolute (T - C) in metric-native units.
    # lift_unit is retained so display layers know how to present the metric
    # (e.g. derive percentage lift for binary metrics when rendering).
    lift_samples = samples_comparison - samples_baseline

    p_positive = _probability_positive(samples_comparison, samples_baseline)
    p_better = _probability_better(
        samples_comparison, samples_baseline, min_practical_effect
    )
    p_harmful = _probability_harmful(
        samples_comparison, samples_baseline, min_practical_effect
    )

    # _expected_loss returns the loss of its first argument first, so the
    # baseline loss comes before the comparison loss.
    loss_baseline, loss_comparison, loss_samples_baseline, loss_samples_comparison = (
        _expected_loss(samples_baseline, samples_comparison)
    )

    ci = _lift_ci(lift_samples, ci_level)

    return ComparisonResult(
        baseline=baseline_name,
        comparison=comparison_name,
        method="compare_to_control",
        probability_positive=p_positive,
        probability_better=p_better,
        probability_harmful=p_harmful,
        expected_loss_baseline=loss_baseline,
        expected_loss_comparison=loss_comparison,
        expected_loss_samples_baseline=loss_samples_baseline,
        expected_loss_samples_comparison=loss_samples_comparison,
        lift_samples=lift_samples,
        lift_unit=lift_unit,
        lift_ci=ci,
        lift_ci_level=ci_level,
        decomposition=decomposition,
    )
def recommendation_summary(
    comparison: ComparisonResult,
    thresholds: DecisionThresholds | None = None,
) -> RecommendationSummary:
    """Turn a comparison result into a ship / stop / continue recommendation.

    Rules are checked in this order and the first match wins:

    1. SHIP: expected loss of the comparison is below the tolerance AND
       ``probability_positive`` and ``probability_better`` both exceed
       their thresholds.
    2. STOP (harm): ``probability_harmful`` exceeds the harm threshold.
    3. STOP (futility): ``probability_better`` is below the futility
       threshold.
    4. CONTINUE: none of the above.

    Args:
        comparison: Result from ``compare_variants()``.
        thresholds: Decision thresholds. Defaults to ``DecisionThresholds()``.

    Returns:
        ``RecommendationSummary`` with the decision, the metrics it was
        based on, and the thresholds that were applied.
    """
    active = DecisionThresholds() if thresholds is None else thresholds

    threshold_names = (
        "expected_loss_tolerance",
        "p_positive_threshold",
        "p_better_threshold",
        "futility_threshold",
        "harm_threshold",
    )
    threshold_dict = {name: getattr(active, name) for name in threshold_names}

    loss_if_shipped = comparison.expected_loss_comparison
    p_positive = comparison.probability_positive
    p_better = comparison.probability_better
    p_harmful = comparison.probability_harmful

    clears_ship_bar = (
        loss_if_shipped < threshold_dict["expected_loss_tolerance"]
        and p_positive > threshold_dict["p_positive_threshold"]
        and p_better > threshold_dict["p_better_threshold"]
    )

    if clears_ship_bar:
        decision = Decision.SHIP
    elif (
        # Harm is checked before futility; both lead to STOP.
        p_harmful > threshold_dict["harm_threshold"]
        or p_better < threshold_dict["futility_threshold"]
    ):
        decision = Decision.STOP
    else:
        decision = Decision.CONTINUE

    return RecommendationSummary(
        treatment=comparison.comparison,
        decision=decision,
        expected_loss_baseline=comparison.expected_loss_baseline,
        expected_loss_comparison=loss_if_shipped,
        probability_positive=p_positive,
        probability_better=p_better,
        probability_harmful=p_harmful,
        thresholds=threshold_dict,
    )