"""v2 scorecard — decision summary and per-scenario aggregation."""
from __future__ import annotations
import dataclasses
import math
import statistics
from collections import defaultdict
from pytyche.contracts import CalibrationRecord, ClaimLevel, Decision
[docs]
@dataclasses.dataclass(frozen=True)
class CellRegretStats:
    """Regret summary for one (oracle decision, actual decision) cell.

    Fields:
        mean: Arithmetic mean of the regret values collected for the cell;
            None when the cell holds no non-None regret values.
        median: Median of the same regret values; None under the same
            condition as ``mean``.
    """

    # Both statistics are either populated together or both None.
    mean: float | None
    median: float | None
[docs]
@dataclasses.dataclass(frozen=True)
class DecisionSummary:
    """Oracle-versus-actual decision accuracy for a set of records.

    The three counters (``n_correct``, ``n_false_ship``, ``n_missed_win``)
    are convenience views for the current 2-arm phase; the nested
    ``decision_matrix`` is the general representation and can grow to more
    decisions without an API break.

    Fields:
        n_correct: Number of records whose ``decision_correct`` flag is set.
        n_false_ship: Number of incorrect decisions where the actual decision
            was SHIP (shipped although the oracle says don't).
        n_missed_win: Number of remaining incorrect decisions; in the 2-arm
            phase these are "oracle says ship, actual did not".
        decision_matrix: ``{oracle_decision_value: {actual_decision_value: count}}``,
            keyed by the ``.value`` of ``Decision`` members (strings).
        cell_regret: ``{oracle_decision_value: {actual_decision_value: CellRegretStats}}``
            holding regret statistics for the same cells as ``decision_matrix``.
    """

    # Scalar convenience counters.
    n_correct: int
    n_false_ship: int
    n_missed_win: int
    # Sparse nested mappings: only cells that received a record are present.
    decision_matrix: dict[str, dict[str, int]]
    cell_regret: dict[str, dict[str, CellRegretStats]]
[docs]
@dataclasses.dataclass(frozen=True)
class ScenarioScorecard:
    """Aggregated calibration metrics for a single scenario.

    The seven metric fields are all ``float | None`` and are all ``None``
    when ``n_records_used == 0``.

    Fields:
        scenario_id: Scenario identifier (from CalibrationRecord).
        n_records_total: Number of records carrying this scenario_id before
            any filtering.
        n_records_used: Number of HONEST_ESTIMATE records, i.e. the records
            the metrics are computed from.
        decision_summary: Decision accuracy summary built from the used records.
        coverage_rate: Fraction of records whose CI contains the true effect,
            in [0, 1].
        bias: Mean of ``estimated_lift - effect`` in metric-native units.
        rmse: ``sqrt(mean((estimated_lift - effect)^2))`` in metric-native units.
        false_ship_rate: ``n_false_ship / n_records_used`` (total-denominator).
        missed_win_rate: ``n_missed_win / n_records_used`` (total-denominator).
        mean_regret: Mean of the non-None regret values; None if every regret
            is None.
        mean_regret_cpm: ``mean_regret * 1000`` when ``mean_regret`` is not
            None, otherwise None.
    """

    # Identification and record counts.
    scenario_id: str
    n_records_total: int
    n_records_used: int
    decision_summary: DecisionSummary
    # Estimation quality metrics.
    coverage_rate: float | None
    bias: float | None
    rmse: float | None
    # Decision error rates.
    false_ship_rate: float | None
    missed_win_rate: float | None
    # Regret metrics.
    mean_regret: float | None
    mean_regret_cpm: float | None
def _build_decision_summary(records: list[CalibrationRecord]) -> DecisionSummary:
    """Tally decision outcomes and per-cell regret for ``records``.

    Args:
        records: CalibrationRecords to summarise; may be empty.

    Returns:
        A DecisionSummary whose ``decision_matrix`` and ``cell_regret`` share
        the same sparse ``{oracle_value: {actual_value: ...}}`` key layout.
    """
    counts: dict[str, dict[str, int]] = {}
    # Regret samples keyed by (oracle_value, actual_value).
    regret_samples: dict[tuple[str, str], list[float]] = {}
    correct = 0
    false_ships = 0
    missed_wins = 0

    for rec in records:
        oracle = rec.oracle_decision
        actual = rec.decision
        oracle_key = oracle.value
        actual_key = actual.value

        row = counts.setdefault(oracle_key, {})
        row[actual_key] = row.get(actual_key, 0) + 1

        # Records without a regret value still count in the matrix but
        # contribute nothing to the regret statistics.
        if rec.regret is not None:
            regret_samples.setdefault((oracle_key, actual_key), []).append(rec.regret)

        if rec.decision_correct:
            correct += 1
            continue
        if actual == Decision.SHIP:
            false_ships += 1
        else:
            # NOTE(review): every incorrect non-SHIP decision lands here; that
            # equals "oracle says ship" only while Decision is effectively
            # two-valued — confirm before multi-arm support lands.
            missed_wins += 1

    def summarize(samples: list[float]) -> CellRegretStats:
        """Return mean/median of ``samples``, or all-None stats when empty."""
        if not samples:
            return CellRegretStats(mean=None, median=None)
        return CellRegretStats(
            mean=statistics.mean(samples),
            median=statistics.median(samples),
        )

    # Mirror the matrix keys exactly so both mappings describe the same cells.
    regret_stats: dict[str, dict[str, CellRegretStats]] = {
        oracle_key: {
            actual_key: summarize(regret_samples.get((oracle_key, actual_key), []))
            for actual_key in row
        }
        for oracle_key, row in counts.items()
    }

    return DecisionSummary(
        n_correct=correct,
        n_false_ship=false_ships,
        n_missed_win=missed_wins,
        decision_matrix=counts,
        cell_regret=regret_stats,
    )
[docs]
def compute_scorecard(records: list[CalibrationRecord]) -> list[ScenarioScorecard]:
    """Group CalibrationRecords by scenario_id and compute per-group metrics.

    Only records with ``analysis_mode == ClaimLevel.HONEST_ESTIMATE`` feed the
    metrics. Both ``n_records_total`` (pre-filter) and ``n_records_used``
    (post-filter) are surfaced on each ScenarioScorecard.

    Args:
        records: Flat list of CalibrationRecords from one or more scenarios.

    Returns:
        List of ScenarioScorecards, one per unique scenario_id, sorted by
        scenario_id for consistent ordering. An empty input yields ``[]``.
    """
    if not records:
        return []

    # Single pass: count every record per scenario (n_records_total) and
    # collect the HONEST_ESTIMATE subset that the metrics are computed from.
    total_by_scenario: dict[str, int] = defaultdict(int)
    honest_by_scenario: dict[str, list[CalibrationRecord]] = defaultdict(list)
    for record in records:
        scenario_id = record.scenario_id
        total_by_scenario[scenario_id] += 1
        if record.analysis_mode == ClaimLevel.HONEST_ESTIMATE:
            honest_by_scenario[scenario_id].append(record)

    # .get() instead of indexing: a scenario with no honest records must not
    # grow the defaultdict as a side effect of being looked up.
    return [
        _score_scenario(
            scenario_id,
            total_by_scenario[scenario_id],
            honest_by_scenario.get(scenario_id, []),
        )
        for scenario_id in sorted(total_by_scenario)
    ]


def _score_scenario(
    scenario_id: str,
    n_records_total: int,
    honest_records: list[CalibrationRecord],
) -> ScenarioScorecard:
    """Build the ScenarioScorecard for one scenario.

    Args:
        scenario_id: Identifier shared by all records of the scenario.
        n_records_total: Number of records for the scenario before filtering.
        honest_records: The scenario's HONEST_ESTIMATE records; may be empty.

    Returns:
        The scorecard; every metric is None when ``honest_records`` is empty.
    """
    n_records_used = len(honest_records)
    decision_summary = _build_decision_summary(honest_records)

    # Metrics default to None and are only filled in when there is data.
    coverage_rate: float | None = None
    bias: float | None = None
    rmse: float | None = None
    false_ship_rate: float | None = None
    missed_win_rate: float | None = None
    mean_regret: float | None = None
    mean_regret_cpm: float | None = None

    if n_records_used:
        # coverage_rate: share of records with ci_low <= effect <= ci_high
        n_covered = sum(
            1 for r in honest_records if r.ci_low <= r.effect <= r.ci_high
        )
        coverage_rate = n_covered / n_records_used

        # Estimation errors are shared by bias and rmse; compute them once.
        errors = [r.estimated_lift - r.effect for r in honest_records]
        bias = sum(errors) / n_records_used
        rmse = math.sqrt(sum(err ** 2 for err in errors) / n_records_used)

        # Decision error rates (total-denominator semantics)
        false_ship_rate = decision_summary.n_false_ship / n_records_used
        missed_win_rate = decision_summary.n_missed_win / n_records_used

        # mean_regret: mean of non-None regret values; None if ALL are None
        regrets = [r.regret for r in honest_records if r.regret is not None]
        if regrets:
            mean_regret = sum(regrets) / len(regrets)
            mean_regret_cpm = mean_regret * 1000.0

    return ScenarioScorecard(
        scenario_id=scenario_id,
        n_records_total=n_records_total,
        n_records_used=n_records_used,
        decision_summary=decision_summary,
        coverage_rate=coverage_rate,
        bias=bias,
        rmse=rmse,
        false_ship_rate=false_ship_rate,
        missed_win_rate=missed_win_rate,
        mean_regret=mean_regret,
        mean_regret_cpm=mean_regret_cpm,
    )