"""Tests for training metric summarization helpers.""" from __future__ import annotations from replicalab.models import RewardBreakdown, ScientistAction, ScientistObservation, StepInfo from replicalab.training.metrics import episode_to_metrics, summarize_episodes from replicalab.training.rollout import EpisodeRecord, StepRecord def _build_step_record( error: str | None = None, *, tool_traces: list[dict[str, object]] | None = None, ) -> StepRecord: return StepRecord( round_number=0, observation=ScientistObservation( paper_title="Paper", paper_hypothesis="Hypothesis", paper_method="Method", paper_key_finding="Finding", experiment_goal="Goal", conversation_history=[], current_protocol=None, round_number=0, max_rounds=6, ), action=ScientistAction( action_type="request_info", sample_size=0, controls=[], technique="", duration_days=0, required_equipment=[], required_reagents=[], questions=["What is missing?"], rationale="", ), reward=0.0, done=False, error=error, info=StepInfo(error=error), tool_traces=tool_traces or [], ) def test_episode_to_metrics_counts_invalid_actions() -> None: record = EpisodeRecord( seed=11, scenario="math_reasoning", difficulty="easy", episode_id="ep-1", steps=[_build_step_record("invalid"), _build_step_record(None)], total_reward=1.25, reward_breakdown=RewardBreakdown( rigor=0.8, feasibility=0.9, fidelity=0.7, parsimony=0.95, ), verdict="accept", agreement_reached=True, tool_traces=[ {"tool": "search_evidence", "status": "ok"}, {"tool": "run_code_check", "status": "error", "error": "timeout"}, ], ) metrics = episode_to_metrics(record) assert metrics.invalid_action_count == 1 assert metrics.invalid_action_rate == 0.5 assert metrics.invalid_bounded_tool_count == 1 assert metrics.invalid_bounded_tool_rate == 0.5 assert metrics.agreement_reached is True assert 0.0 <= metrics.paper_understanding <= 1.0 assert metrics.communication_quality == 0.0 def test_summarize_episodes_aggregates_rewards() -> None: first = EpisodeRecord( seed=1, scenario="math_reasoning", difficulty="easy", episode_id="ep-1", steps=[_build_step_record(None)], total_reward=2.0, reward_breakdown=RewardBreakdown(rigor=0.6, feasibility=0.7, fidelity=0.8), verdict="accept", agreement_reached=True, tool_traces=[{"tool": "search_evidence", "status": "ok"}], ) second = EpisodeRecord( seed=2, scenario="ml_benchmark", difficulty="medium", episode_id="ep-2", steps=[_build_step_record("invalid")], total_reward=0.5, reward_breakdown=RewardBreakdown(rigor=0.2, feasibility=0.4, fidelity=0.5), verdict="timeout", agreement_reached=False, tool_traces=[{"tool": "run_code_check", "status": "error"}], ) summary = summarize_episodes([first, second]) assert summary.episode_count == 2 assert summary.average_reward == 1.25 assert 0.0 < summary.invalid_action_rate < 1.0 assert summary.average_invalid_bounded_tool_rate == 0.5 assert 0.0 <= summary.average_paper_understanding <= 1.0 assert summary.average_communication_quality == 0.0