# uzh-fpv-sv-test/benchmark/config.py

"""
Benchmark configuration — evaluation-only scene splits and metric definitions.

This config is independent from src.velocity_prediction.config so that
evaluation scenarios can be changed without touching training config.
"""

from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional


# ──────────────────────────── Dataset root ────────────────────────────

# "dataset" directory located two levels above this file's resolved path
# (i.e. next to the directory that contains this module).
DATASET_ROOT = Path(__file__).resolve().parents[1].joinpath("dataset")

# ──────────────────────────── Scene splits ────────────────────────────

# Each scene group has a name, a list of scene dirs, and a difficulty label.
# The test scenes are the primary evaluation set; val scenes are for
# checkpoint selection reference.


@dataclass
class SceneGroup:
    """A named set of scene directories tagged with a difficulty label."""

    # Identifier of the group.
    name: str
    # Scene directory names that belong to the group.
    scenes: List[str]
    # Free-form label; this file uses "easy", "medium" and "hard".
    # The value is not validated here.
    difficulty: str = field(default="medium")


# ── Validation scenes (for checkpoint selection reference) ──
VAL_SCENE_GROUPS: List[SceneGroup] = [
    SceneGroup(name="indoor_forward_7", scenes=["indoor_forward_7"], difficulty="hard"),
    SceneGroup(name="outdoor_forward_1", scenes=["outdoor_forward_1"], difficulty="easy"),
    # Currently disabled validation candidates (uncomment to re-enable):
    # SceneGroup(name="indoor_forward_6", scenes=["indoor_forward_6"], difficulty="medium"),
    # SceneGroup(name="indoor_forward_9", scenes=["indoor_forward_9"], difficulty="easy"),
    # SceneGroup(name="indoor_forward_10", scenes=["indoor_forward_10"], difficulty="easy"),
    # SceneGroup(name="indoor_forward_5", scenes=["indoor_forward_5"], difficulty="medium"),
]

# ── Test scenes (primary evaluation) ──
# NOTE(review): "indoor_forward_7" and "outdoor_forward_1" are also listed in
# VAL_SCENE_GROUPS, so the validation and test sets overlap — confirm this is
# intended.
TEST_SCENE_GROUPS: List[SceneGroup] = [
    SceneGroup(name="indoor_forward_7", scenes=["indoor_forward_7"], difficulty="hard"),
    SceneGroup(name="outdoor_forward_1", scenes=["outdoor_forward_1"], difficulty="easy"),
    SceneGroup(name="outdoor_forward_5", scenes=["outdoor_forward_5"], difficulty="hard"),
    SceneGroup(name="indoor_forward_6", scenes=["indoor_forward_6"], difficulty="medium"),
    SceneGroup(name="indoor_forward_9", scenes=["indoor_forward_9"], difficulty="easy"),
    SceneGroup(name="indoor_forward_10", scenes=["indoor_forward_10"], difficulty="easy"),
    SceneGroup(name="indoor_forward_5", scenes=["indoor_forward_5"], difficulty="medium"),
]

# Scene directory names flattened out of each group list, in declaration order.
VAL_SCENES: List[str] = [
    scene_dir for group in VAL_SCENE_GROUPS for scene_dir in group.scenes
]
TEST_SCENES: List[str] = [
    scene_dir for group in TEST_SCENE_GROUPS for scene_dir in group.scenes
]

# Difficulty grouping
def _scenes_by_difficulty(groups: List["SceneGroup"]) -> Dict[str, List[str]]:
    """Map each difficulty label to the scene names of all groups carrying it.

    Keys appear in the order their label is first seen in ``groups``; scene
    names keep the order in which the groups list them.
    """
    grouped: Dict[str, List[str]] = {}
    for group in groups:
        grouped.setdefault(group.difficulty, []).extend(group.scenes)
    return grouped


# Built through a helper so that no loop variable is left behind in the
# module namespace (a module-level ``for`` keeps its target bound afterwards,
# which also exposes it to star-imports).
DIFFICULTY_GROUPS: Dict[str, List[str]] = _scenes_by_difficulty(TEST_SCENE_GROUPS)


# ──────────────────────────── Evaluation parameters ────────────────────────────


@dataclass
class EvalConfig:
    """Parameters used when running evaluation.

    Every field has a default, so ``EvalConfig()`` yields a complete
    configuration; individual fields can be overridden at construction.
    """

    # Sequence length (must match what the model was trained with)
    seq_len: int = 8

    # Batch size for evaluation (can be larger than training)
    batch_size: int = 64

    # Data loading: worker count
    num_workers: int = 2

    # Event simulation (must match training config)
    event_threshold: float = 0.1
    event_use_log: bool = True

    # Output directory (relative to benchmark/results/)
    # NOTE(review): with the default "results" that reads as
    # benchmark/results/results — confirm which base directory consumers use.
    output_dir: str = "results"

    # Whether to generate per-scene plots
    save_plots: bool = True

    # Device override (None = auto-detect). The annotation admits None to
    # match that documented sentinel; the default stays "cuda" so existing
    # behaviour is unchanged.
    # NOTE(review): how consumers resolve None is not visible in this file —
    # confirm before passing it.
    device: Optional[str] = "cuda"


# ──────────────────────────── Metrics definition ────────────────────────────

# Names of the metrics reported per axis and overall.
METRICS: List[str] = [
    "rmse",
    "mae",
    "r2",
]

# Module-level instance built with all default values. Nothing in this file
# prevents creating further EvalConfig objects.
eval_cfg: EvalConfig = EvalConfig()