diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..2c864ff --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,171 @@ +# UZH-FPV Velocity Prediction + +从 DAVIS 事件相机灰度图像序列中预测**机体速度**(body-frame forward/lateral velocity)。 + +## 项目结构 + +``` +uzh_fpv/ +├── AGENTS.md # ← 本文件 +├── requirements.txt # Python 依赖 +├── DATASET_FORMAT.md # 数据集格式详细说明 +├── rosbag2wds.py # ROS bag → WebDataset shard 转换脚本 +├── batch_convert.sh # 批量转换脚本 +├── dataset/ # 数据集(.gitignore 忽略) +│ └── / +│ ├── shard_0000.tar # WebDataset shard(图像+GT) +│ ├── imu_sequence.npz # 完整 IMU 序列 +│ └── metadata.json # 元信息 +├── src/ +│ ├── event_utils.py # EventProcessor: 帧间亮度变化 → 模拟事件帧 +│ └── velocity_prediction/ # 主项目代码 +│ ├── __init__.py # 模块说明 +│ ├── config.py # 路径、模型架构、训练超参数 +│ ├── utils.py # 四元数运算(torch + numpy 封装) +│ ├── transforms.py # 数据预处理管线 +│ ├── dataset.py # WebDataset 加载 + 序列采样 +│ ├── model.py # CNN + PoseMLP + GRU + Head +│ ├── train.py # 训练循环 +│ └── evaluate.py # 评估 + 绘图 +├── visualize/ +│ ├── __init__.py +│ └── visualize_dataset.py # 数据集可视化:叠加位姿信息并生成视频 +├── benchmark/ +│ ├── __init__.py +│ ├── config.py # 评估配置 +│ ├── evaluate.py # 完整评估管线 +│ └── benchmark.py # 统一评估入口 +├── checkpoints/ # 模型权重(.gitignore 忽略) +├── logs/ # TensorBoard 日志(.gitignore 忽略) +└── videos/ # 可视化输出视频 +``` + +## 运行环境 + +```bash +uv run python -m # 使用 uv 虚拟环境运行 +``` + +依赖见 `requirements.txt`,核心依赖:PyTorch、WebDataset、OpenCV、NumPy、Matplotlib。 + +## 数据集 + +UZH-FPV 数据集,由 DAVIS 事件相机采集。每个场景目录包含: + +| 文件 | 格式 | 内容 | +|------|------|------| +| `shard_*.tar` | WebDataset | 灰度图 (320×240) + 位姿 + 速度 + 时间戳 | +| `imu_sequence.npz` | NPZ | 完整 IMU 序列(加速度+角速度) | +| `metadata.json` | JSON | 场景元信息 | + +shard 中每个样本的字段: + +| Key | 类型 | 说明 | +|-----|------|------| +| `jpg` | JPEG bytes | 灰度图 320×240 | +| `ts` | float64 | 时间戳 | +| `pose` | float32[7] | `[x, y, z, qx, qy, qz, qw]` 世界→机体四元数 | +| `vel` | float32[6] | `[vx, vy, vz, wx, wy, wz]` 世界线速度 + 角速度 | + +坐标系:z 轴与重力对齐(水平坐标系)。 + +### 场景列表 + +| 场景 | 帧数 | 类型 | +|------|------|------| +| indoor_forward_3/5/6/7/9/10 | 627~4918 | 室内前飞 | +| indoor_45_2/4/9/12/13/14 | 656~1472 | 室内 45° 飞行 | +| outdoor_forward_1/3/5 | 907~13299 | 室外前飞 | +| outdoor_45_1 | 799 | 室外 45° 飞行 | + +## 模型 + +### 架构 + +``` +Event frame (1, 240, 320) ──► CNN (4 Conv+Pool+GAP, 256-d) + │ +Body up (3,) ──► PoseMLP (3→32→64, 64-d) ────────────────────────┤ + │ + concat (320-d) ← per-frame + │ + GRU (hidden=128) + │ + Head MLP (128→64→2) + │ + [v_right, v_forward] +``` + +**注意**:当前 CNN 编码器被禁用(输出全零),模型仅依赖 `PoseMLP + GRU + Head`。 + +### 输入 + +- `events`: `(B, S, 1, H, W)` — 模拟事件帧,值域 `{-1, 0, +1}` +- `tilt`: `(B, S, 3)` — body up 向量(世界 up 旋转到机体坐标系),仅含 pitch/roll,不含 yaw,单位向量 + +### 输出 + +- `v_body`: `(B, 2)` — 机体坐标系 `[v_right, v_forward]` 速度 (m/s) + +### 数据预处理管线 + +``` +shard_*.tar → DecodeSample → SimulateEvents → ComputeTilt → ComputeBodyVelocity → NormalizeVelocity +``` + +1. **DecodeSample**: JPEG → 灰度图 uint8 (H,W);bytes → float32 数组 +2. **SimulateEvents**: 帧间亮度变化 → 二值事件帧 `{-1, 0, +1}` +3. **ComputeTilt**: 四元数 (world→odom) → 应用 R_odom_to_body → 旋转 world-up [0,0,1] → body up 向量 (3,) +4. **ComputeBodyVelocity**: 世界速度 → 应用 R_odom_to_body → yaw 补偿(仅去除偏航,保留 tilt)→ 水平面 `[v_right, v_forward]` +5. **NormalizeVelocity**: 归一化 + +### 训练配置 + +- seq_len=8, batch_size=32, epochs=100 +- lr=1e-3, AdamW, StepLR (step=30, gamma=0.5) +- Loss: MSELoss +- 训练/验证/测试场景见 `config.py` + +## 关键命令 + +```bash +# 训练 +uv run python -m src.velocity_prediction.train --device cuda:0 + +# 评估 +uv run python -m src.velocity_prediction.evaluate --checkpoint checkpoints/best.pt + +# 数据集可视化(单场景) +uv run python -m visualize.visualize_dataset --scene indoor_forward_3 --output videos/scene.mp4 + +# 数据集可视化(全部场景) +uv run python -m visualize.visualize_dataset --all --output videos/ + +# 数据集可视化(实时显示) +uv run python -m visualize.visualize_dataset --scene indoor_forward_3 --show + +# Benchmark 评估 +uv run python -m benchmark.benchmark --checkpoint checkpoints/best.pt +``` + +## 可视化说明 + +`visualize/visualize_dataset.py` 在每帧图像上叠加: + +- 帧号、时间戳、世界坐标位置 +- 欧拉角 `[roll, pitch, yaw]`(从 body 四元数计算) +- Body up 向量 `[x, y, z]` +- 机体速度 `v_body [forward, lateral]` +- 世界速度 `v_world [vx, vy, vz]` +- 机体坐标系三轴箭头(左下角) +- 机体速度方向箭头(图像中心) + +## 关键约定 + +- 四元数格式:`[x, y, z, w]`(不是 `[w, x, y, z]`) +- GT 四元数表示 **world→odom**(不是 world→body),通过静态 R_odom_to_body 校正 +- Body 坐标系(ROS 右手系):`body_x=右, body_y=前, body_z=上` +- R_odom_to_body = R_y(45°) @ R_x(90°):先绕 odom_x 转 +90°,再绕 odom_y 转 +45° +- 速度归一化统计量:待重新计算 +- 模型预测 `[v_right, v_forward]`(右向和前向速度) +- 所有代码在项目根目录下以 `uv run python -m ` 运行 diff --git a/src/velocity_prediction/model.py b/src/velocity_prediction/model.py index 720a5c3..dc294a3 100644 --- a/src/velocity_prediction/model.py +++ b/src/velocity_prediction/model.py @@ -1,9 +1,9 @@ """ -VelocityPredictionModel: CNN + PoseMLP → concat → GRU → Head → [vx_body, vy_body]. +VelocityPredictionModel: CNN + PoseMLP → concat → GRU → Head → [v_right, v_forward]. Architecture: Event frame (1, H, W) ──► CNN ──┐ - Tilt angles (3,) ──► MLP ──┤──► concat ──► GRU ──► Head ──► [vx, vy] + Tilt angles (3,) ──► MLP ──┤──► concat ──► GRU ──► Head ──► [v_right, v_forward] """ import torch @@ -92,7 +92,7 @@ class VelocityPredictionModel(nn.Module): events: (B, S, 1, H, W) tilt: (B, S, 3) Output: - v_body: (B, 2) — body-frame [vx, vy] for the last frame in the sequence + v_body: (B, 2) — body-frame [v_forward, v_lateral] for the last frame in the sequence """ def __init__(self, cnn_cfg=model_cfg.cnn, pose_cfg=model_cfg.pose_mlp, @@ -129,10 +129,13 @@ class VelocityPredictionModel(nn.Module): tilt: (B, S, 3) Returns: - v_body: (B, 2) predicted body-frame [vx, vy] at the last timestep + v_body: (B, 2) predicted body-frame [v_forward, v_lateral] at the last timestep """ # Per-frame encoding - cnn_feat = self.cnn(events) # (B, S, 256) + # cnn_feat = self.cnn(events) # (B, S, 256) + B, S = events.shape[:2] + cnn_feat = events.new_zeros(B, S, self.cnn.out_dim) # 全零替代 + pose_feat = self.pose_mlp(tilt) # (B, S, 64) # Fuse per frame diff --git a/src/velocity_prediction/transforms.py b/src/velocity_prediction/transforms.py index 42f040c..cdcc118 100644 --- a/src/velocity_prediction/transforms.py +++ b/src/velocity_prediction/transforms.py @@ -15,7 +15,7 @@ import numpy as np import cv2 from src.event_utils import EventProcessor -from src.velocity_prediction.utils import body_attitude_np, world_vel_to_body_np +from src.velocity_prediction.utils import body_up_vector_np, world_vel_to_body_np from src.velocity_prediction.config import VELOCITY_MEAN, VELOCITY_STD @@ -75,18 +75,18 @@ class SimulateEvents: class ComputeTilt: - """Compute body attitude rotation vector from pose quaternion. + """Compute body up vector from pose quaternion. Applies the static calibration R_odom_to_body to obtain the true - world→body quaternion, then converts to a rotation vector. - Unlike the old approach, yaw is preserved — the model can decide - how to use it. + world→body quaternion, then rotates world-up [0,0,1] to get the + body up vector. This encodes only pitch and roll (no yaw), + matching the DiffPhysDrone approach (env.R[:, 2]). """ def __call__(self, sample: dict) -> dict: q = sample["pose"][3:7] # [qx, qy, qz, qw] world→odom - att = body_attitude_np(q) # (3,) rotation vector of true body - sample["tilt"] = att.astype(np.float32) + up = body_up_vector_np(q) # (3,) body up vector, unit norm + sample["tilt"] = up.astype(np.float32) return sample diff --git a/src/velocity_prediction/utils.py b/src/velocity_prediction/utils.py index 7cb1df4..3f9d00d 100644 --- a/src/velocity_prediction/utils.py +++ b/src/velocity_prediction/utils.py @@ -229,31 +229,39 @@ def decompose_tilt_from_odom(q_world_to_odom: torch.Tensor) -> torch.Tensor: return decompose_tilt(q_world_to_body) -# ──────────────────────────── Body attitude (new approach) ──────────────────────────── +# ──────────────────────────── Body up vector (DiffPhysDrone style) ──────────────────────────── # -# Instead of removing yaw from the body quaternion, we directly use the -# corrected world→body quaternion's rotation vector. This preserves yaw -# information and lets the model decide how to use it — analogous to how -# DiffPhysDrone uses the body-up vector as a tilt feature. +# Instead of using a rotation vector (which includes yaw), we extract the +# body-frame up vector by rotating world-up [0,0,1] by the corrected +# world→body quaternion. This is exactly what DiffPhysDrone does: +# tilt_feature = env.R[:, 2] (third column of rotation matrix = body up) +# +# The body up vector encodes pitch and roll only — yaw has no effect on it. +# It's a unit vector (norm=1), so no normalization is needed. -def body_attitude(q_world_to_odom: torch.Tensor) -> torch.Tensor: +def body_up_vector(q_world_to_odom: torch.Tensor) -> torch.Tensor: """ - Compute the true body attitude rotation vector from GT odom quaternion. + Compute the body-frame up vector from GT odom quaternion. - Applies the static calibration R_odom_to_body, then converts the - resulting world→body quaternion directly to a rotation vector. - Unlike decompose_tilt, this preserves yaw information. + Applies the static calibration R_odom_to_body, then rotates the + world up vector [0, 0, 1] by the resulting world→body quaternion. + + The result is the body's up direction expressed in world coordinates — + identical to the third column of the rotation matrix R[:, 2]. + This encodes only pitch and roll; yaw has no effect. Args: q_world_to_odom: (..., 4) world→odom unit quaternion from GT Returns: - attitude: (..., 3) rotation vector [rx, ry, rz] of the true body + up: (..., 3) body up vector [x, y, z] in world coordinates, unit norm """ q_R = quat_from_matrix(R_ODOM_TO_BODY.to(q_world_to_odom.device)) q_world_to_body = quat_mul(q_world_to_odom, q_R) q_world_to_body = quat_normalize(q_world_to_body) - return quat_to_rotvec(q_world_to_body) + world_up = torch.zeros_like(q_world_to_body[..., :3]) + world_up[..., 2] = 1.0 # [0, 0, 1] + return quat_rotate(q_world_to_body, world_up) def quat_to_euler(q: torch.Tensor) -> torch.Tensor: @@ -298,11 +306,11 @@ def decompose_tilt_np(q: np.ndarray) -> np.ndarray: return tilt.numpy() -def body_attitude_np(q: np.ndarray) -> np.ndarray: - """NumPy version of body_attitude.""" +def body_up_vector_np(q: np.ndarray) -> np.ndarray: + """NumPy version of body_up_vector.""" q_t = torch.from_numpy(q) - att = body_attitude(q_t) - return att.numpy() + up = body_up_vector(q_t) + return up.numpy() def quat_to_euler_np(q: np.ndarray) -> np.ndarray: diff --git a/visualize/__init__.py b/visualize/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/visualize/visualize_dataset.py b/visualize/visualize_dataset.py index 2fdce21..d92f2e9 100644 --- a/visualize/visualize_dataset.py +++ b/visualize/visualize_dataset.py @@ -24,7 +24,7 @@ import torch # Reuse the same coordinate transforms as the training pipeline from src.velocity_prediction.utils import ( - body_attitude_np, + body_up_vector_np, quat_to_euler_np, world_vel_to_body_np, quat_normalize, @@ -199,11 +199,11 @@ def draw_pose_overlay( ] put_text(euler_lines, origin=(10, 62), color=(0, 200, 255)) - # ── Body attitude (rotation vector) ── - tilt_lines = [ - f"Att: rx={tilt[0]:+.3f} ry={tilt[1]:+.3f} rz={tilt[2]:+.3f}", + # ── Body up vector (pitch & roll only) ── + up_lines = [ + f"Body up: ({tilt[0]:+.3f}, {tilt[1]:+.3f}, {tilt[2]:+.3f})", ] - put_text(tilt_lines, origin=(10, 104), color=(0, 200, 255)) + put_text(up_lines, origin=(10, 104), color=(0, 200, 255)) # ── Body-frame velocity ── v_right, v_forward = v_body # [v_right, v_forward] @@ -334,8 +334,8 @@ def create_video( for i, frame_data in enumerate(frames): q_raw = frame_data["pose"][3:7] # [qx, qy, qz, qw] world→odom - # True body attitude rotation vector (preserves yaw) - tilt = body_attitude_np(q_raw) # (3,) + # Body up vector (pitch & roll only, no yaw) — matches DiffPhysDrone + body_up = body_up_vector_np(q_raw) # (3,) unit vector # Euler angles from body quaternion for display q_body = correct_attitude(q_raw) @@ -349,7 +349,7 @@ def create_video( canvas=frame_data["img"], pose=frame_data["pose"], vel=frame_data["vel"], - tilt=tilt, + tilt=body_up, v_body=v_body, euler=euler_deg, frame_idx=i,