Initial version runs end to end, but the loss computation is buggy and training does not converge
multi_gpu_temporal_train.sh (Executable file, +26)
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

# Simple multi-GPU training script for SwiftFormerTemporal
# Usage: ./multi_gpu_temporal_train.sh <NUM_GPUS> [OPTIONS]

NUM_GPUS=${1:-2}
shift || true  # drop the GPU-count argument; tolerate being called with no arguments

echo "Starting multi-GPU training with $NUM_GPUS GPUs"

# Set environment variables for distributed training
export MASTER_PORT=12345
export MASTER_ADDR=localhost
export WORLD_SIZE=$NUM_GPUS

# Launch training
torchrun --nproc_per_node=$NUM_GPUS --master_port=$MASTER_PORT main_temporal.py \
    --data-path "./videos" \
    --model SwiftFormerTemporal_XS \
    --batch-size 32 \
    --epochs 100 \
    --lr 1e-3 \
    --output-dir "./temporal_output_multi" \
    --num-workers 8 \
    --pin-mem \
    "$@"
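For reference, a typical invocation is sketched below; the GPU count and the extra flags are illustrative. Because trailing options are forwarded to main_temporal.py via "$@" after the hard-coded defaults, with a typical argparse setup the last occurrence of a repeated flag wins, so they effectively override the values baked into the script:

    chmod +x multi_gpu_temporal_train.sh
    ./multi_gpu_temporal_train.sh 4 --batch-size 16 --lr 5e-4   # 4 GPUs, smaller per-GPU batch, lower LR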