""" Benchmark configuration — evaluation-only scene splits and metric definitions. This config is independent from src.velocity_prediction.config so that evaluation scenarios can be changed without touching training config. """ from dataclasses import dataclass, field from pathlib import Path from typing import List, Dict # ──────────────────────────── Dataset root ──────────────────────────── DATASET_ROOT = Path(__file__).resolve().parents[1] / "dataset" # ──────────────────────────── Scene splits ──────────────────────────── # Each scene group has a name, a list of scene dirs, and a difficulty label. # The test scenes are the primary evaluation set; val scenes are for # checkpoint selection reference. @dataclass class SceneGroup: name: str scenes: List[str] difficulty: str = "medium" # easy / medium / hard # ── Validation scenes (for checkpoint selection reference) ── VAL_SCENE_GROUPS: List[SceneGroup] = [ SceneGroup("indoor_forward_7", ["indoor_forward_7"], "hard"), SceneGroup("outdoor_forward_1", ["outdoor_forward_1"], "easy"), # SceneGroup("indoor_forward_6", ["indoor_forward_6"], "medium"), # SceneGroup("indoor_forward_9", ["indoor_forward_9"], "easy"), # SceneGroup("indoor_forward_10", ["indoor_forward_10"], "easy"), # SceneGroup("indoor_forward_5", ["indoor_forward_5"], "medium"), ] # ── Test scenes (primary evaluation) ── TEST_SCENE_GROUPS: List[SceneGroup] = [ SceneGroup("indoor_forward_7", ["indoor_forward_7"], "hard"), SceneGroup("outdoor_forward_1", ["outdoor_forward_1"], "easy"), SceneGroup("outdoor_forward_5", ["outdoor_forward_5"], "hard"), SceneGroup("indoor_forward_6", ["indoor_forward_6"], "medium"), SceneGroup("indoor_forward_9", ["indoor_forward_9"], "easy"), SceneGroup("indoor_forward_10", ["indoor_forward_10"], "easy"), SceneGroup("indoor_forward_5", ["indoor_forward_5"], "medium"), ] # Flat lists for convenience VAL_SCENES: List[str] = [s for g in VAL_SCENE_GROUPS for s in g.scenes] TEST_SCENES: List[str] = [s for g in TEST_SCENE_GROUPS for s in g.scenes] # Difficulty grouping DIFFICULTY_GROUPS: Dict[str, List[str]] = {} for g in TEST_SCENE_GROUPS: DIFFICULTY_GROUPS.setdefault(g.difficulty, []).extend(g.scenes) # ──────────────────────────── Evaluation parameters ──────────────────────────── @dataclass class EvalConfig: """Parameters used when running evaluation.""" # Sequence length (must match what the model was trained with) seq_len: int = 8 # Batch size for evaluation (can be larger than training) batch_size: int = 64 # Data loading num_workers: int = 2 # Event simulation (must match training config) event_threshold: float = 0.1 event_use_log: bool = True # Output directory (relative to benchmark/results/) output_dir: str = "results" # Whether to generate per-scene plots save_plots: bool = True # Device override (None = auto-detect) device: str = "cuda" # ──────────────────────────── Metrics definition ──────────────────────────── # Metrics computed per-axis and overall METRICS = ["rmse", "mae", "r2"] # Singleton eval_cfg = EvalConfig()