Source code for pytyche.calibrate.scorecard

"""v2 scorecard — decision summary and per-scenario aggregation."""

from __future__ import annotations

import dataclasses
import math
import statistics
from collections import defaultdict

from pytyche.contracts import CalibrationRecord, ClaimLevel, Decision


@dataclasses.dataclass(frozen=True)
class CellRegretStats:
    """Regret summary for one (oracle decision, actual decision) cell.

    Instances are immutable. Both statistics are ``None`` when the cell
    has no regret values to aggregate.

    Fields:
        mean: Arithmetic mean of the regret values collected for the cell.
        median: Median of the regret values collected for the cell.
    """

    mean: float | None
    median: float | None
@dataclasses.dataclass(frozen=True)
class DecisionSummary:
    """Oracle-versus-actual decision accuracy for a set of records.

    The three ``n_*`` counters are convenience views over
    ``decision_matrix`` for the current 2-arm phase; the matrix itself is
    the general representation and can grow to multi-arm decisions
    without changing this API.

    Fields:
        n_correct: Number of decisions that matched the oracle.
        n_false_ship: Shipped although the oracle says not to
            (oracle != SHIP, actual == SHIP).
        n_missed_win: Did not ship although the oracle says to
            (oracle == SHIP, actual != SHIP).
        decision_matrix: ``{oracle_decision_value: {actual_decision_value:
            count}}``. Keys are ``Decision`` enum values (strings).
        cell_regret: ``{oracle_decision_value: {actual_decision_value:
            CellRegretStats}}``; regret statistics for the cells of
            ``decision_matrix``.
    """

    n_correct: int
    n_false_ship: int
    n_missed_win: int
    decision_matrix: dict[str, dict[str, int]]
    cell_regret: dict[str, dict[str, CellRegretStats]]
@dataclasses.dataclass(frozen=True)
class ScenarioScorecard:
    """Aggregated calibration metrics for a single scenario.

    The seven metric fields are all ``float | None``; every one of them is
    ``None`` when ``n_records_used == 0``.

    Fields:
        scenario_id: Scenario identifier (from CalibrationRecord).
        n_records_total: Count of ALL records with this scenario_id
            (pre-filter).
        n_records_used: Count of HONEST_ESTIMATE records (post-filter).
        decision_summary: Decision accuracy summary (counts only, no rates).
        coverage_rate: Fraction of CIs containing the true effect, in [0, 1].
        bias: mean(estimated_lift - effect), in metric-native units.
        rmse: sqrt(mean((estimated_lift - effect)^2)), in metric-native units.
        false_ship_rate: n_false_ship / n_records_used (total-denominator).
        missed_win_rate: n_missed_win / n_records_used (total-denominator).
        mean_regret: Mean of the non-None regret values; None if ALL are None.
        mean_regret_cpm: mean_regret * 1000 when mean_regret is not None,
            otherwise None.
    """

    # Identification and record counts
    scenario_id: str
    n_records_total: int
    n_records_used: int

    # Decision accuracy counts
    decision_summary: DecisionSummary

    # Metrics
    coverage_rate: float | None
    bias: float | None
    rmse: float | None
    false_ship_rate: float | None
    missed_win_rate: float | None
    mean_regret: float | None
    mean_regret_cpm: float | None
def _build_decision_summary(records: list[CalibrationRecord]) -> DecisionSummary:
    """Tally oracle-vs-actual decisions and per-cell regret for *records*.

    Args:
        records: CalibrationRecords to summarise; may be empty.

    Returns:
        A DecisionSummary whose ``decision_matrix`` and ``cell_regret`` hold
        exactly the (oracle, actual) cells that occur in *records*.
    """
    counts: dict[str, dict[str, int]] = {}
    # Non-None regret values, keyed by (oracle value, actual value).
    regrets_by_cell: dict[tuple[str, str], list[float]] = defaultdict(list)
    correct = 0
    false_ships = 0
    missed_wins = 0

    for rec in records:
        oracle, actual = rec.oracle_decision, rec.decision
        oracle_key, actual_key = oracle.value, actual.value

        row = counts.setdefault(oracle_key, {})
        row[actual_key] = row.get(actual_key, 0) + 1

        # Records without a regret value still count in the matrix but
        # contribute nothing to the regret statistics.
        if rec.regret is not None:
            regrets_by_cell[oracle_key, actual_key].append(rec.regret)

        # NOTE(review): every incorrect decision that is not a SHIP is
        # counted as a missed win. That matches "oracle == SHIP" only if
        # Decision has exactly two members — confirm.
        if rec.decision_correct:
            correct += 1
        elif actual == Decision.SHIP:
            false_ships += 1
        else:
            missed_wins += 1

    def _cell_stats(values: list[float] | None) -> CellRegretStats:
        # A cell whose records all lack regret gets None for both statistics.
        if not values:
            return CellRegretStats(mean=None, median=None)
        return CellRegretStats(
            mean=statistics.mean(values),
            median=statistics.median(values),
        )

    # Sparse by construction: one regret entry per cell of the count matrix.
    cell_regret: dict[str, dict[str, CellRegretStats]] = {
        oracle_key: {
            actual_key: _cell_stats(regrets_by_cell.get((oracle_key, actual_key)))
            for actual_key in row
        }
        for oracle_key, row in counts.items()
    }

    return DecisionSummary(
        n_correct=correct,
        n_false_ship=false_ships,
        n_missed_win=missed_wins,
        decision_matrix=counts,
        cell_regret=cell_regret,
    )
def compute_scorecard(records: list[CalibrationRecord]) -> list[ScenarioScorecard]:
    """Aggregate CalibrationRecords into one scorecard per scenario_id.

    Metrics use only the records whose ``analysis_mode`` is
    ``ClaimLevel.HONEST_ESTIMATE``. Each scorecard reports both the
    unfiltered count (``n_records_total``) and the filtered count
    (``n_records_used``).

    Args:
        records: Flat list of CalibrationRecords from one or more scenarios.

    Returns:
        List of ScenarioScorecards, one per unique scenario_id, sorted by
        scenario_id for consistent ordering. Empty when *records* is empty.
    """
    if not records:
        return []

    # Single grouping pass: every record counts toward the total, only
    # HONEST_ESTIMATE records feed the metrics.
    everything: dict[str, list[CalibrationRecord]] = defaultdict(list)
    honest_only: dict[str, list[CalibrationRecord]] = defaultdict(list)
    for rec in records:
        everything[rec.scenario_id].append(rec)
        if rec.analysis_mode == ClaimLevel.HONEST_ESTIMATE:
            honest_only[rec.scenario_id].append(rec)

    result: list[ScenarioScorecard] = []
    for scenario_id in sorted(everything):
        used = honest_only.get(scenario_id, [])
        n_used = len(used)
        summary = _build_decision_summary(used)

        # Every metric stays None unless at least one record survived the filter.
        coverage_rate: float | None = None
        bias: float | None = None
        rmse: float | None = None
        false_ship_rate: float | None = None
        missed_win_rate: float | None = None
        mean_regret: float | None = None
        mean_regret_cpm: float | None = None

        if n_used:
            # Share of records whose interval [ci_low, ci_high] contains the effect.
            n_covered = sum(1 for r in used if r.ci_low <= r.effect <= r.ci_high)
            coverage_rate = n_covered / n_used

            # Signed estimation errors drive both bias and rmse.
            errors = [r.estimated_lift - r.effect for r in used]
            bias = sum(errors) / n_used
            rmse = math.sqrt(sum(err ** 2 for err in errors) / n_used)

            # Decision error rates use the total-denominator convention.
            false_ship_rate = summary.n_false_ship / n_used
            missed_win_rate = summary.n_missed_win / n_used

            # Regret is averaged over the records that actually carry one.
            known_regrets = [r.regret for r in used if r.regret is not None]
            if known_regrets:
                mean_regret = sum(known_regrets) / len(known_regrets)
                mean_regret_cpm = mean_regret * 1000.0

        result.append(
            ScenarioScorecard(
                scenario_id=scenario_id,
                n_records_total=len(everything[scenario_id]),
                n_records_used=n_used,
                decision_summary=summary,
                coverage_rate=coverage_rate,
                bias=bias,
                rmse=rmse,
                false_ship_rate=false_ship_rate,
                missed_win_rate=missed_win_rate,
                mean_regret=mean_regret,
                mean_regret_cpm=mean_regret_cpm,
            )
        )

    return result