diff --git a/README.md b/README.md index 2eb9db8..90d83ce 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,17 @@ # SwiftFormer ### **SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications** -[Abdelrahman Shaker](https://scholar.google.com/citations?hl=en&user=eEz4Wu4AAAAJ), -[Muhammad Maaz](https://scholar.google.com/citations?user=vTy9Te8AAAAJ&hl=en&authuser=1&oi=sra), -[Hanoona Rasheed](https://scholar.google.com/citations?user=yhDdEuEAAAAJ&hl=en&authuser=1&oi=sra), -[Salman Khan](https://salman-h-khan.github.io), -[Ming-Hsuan Yang](https://scholar.google.com/citations?user=p9-ohHsAAAAJ&hl=en), -and [Fahad Shahbaz Khan](https://scholar.google.es/citations?user=zvaeYnUAAAAJ&hl=en) +![](https://i.imgur.com/waxVImv.png) +[Abdelrahman Shaker](https://scholar.google.com/citations?hl=en&user=eEz4Wu4AAAAJ)*1, [Muhammad Maaz](https://scholar.google.com/citations?user=vTy9Te8AAAAJ&hl=en&authuser=1&oi=sra)1, [Hanoona Rasheed](https://scholar.google.com/citations?user=yhDdEuEAAAAJ&hl=en&authuser=1&oi=sra)1, [Salman Khan](https://salman-h-khan.github.io/)1, [Ming-Hsuan Yang](https://scholar.google.com/citations?user=p9-ohHsAAAAJ&hl=en)2,3 and [Fahad Shahbaz Khan](https://scholar.google.es/citations?user=zvaeYnUAAAAJ&hl=en)1,4 +Mohamed Bin Zayed University of Artificial Intelligence1, University of California Merced2, Google Research3, Linkoping University4 [![paper](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2303.15446) ## :rocket: News +* **(Jul 14, 2023):** SwiftFormer has been accepted at ICCV 2023. :fire::fire: * **(Mar 27, 2023):** Classification training and evaluation codes along with pre-trained models are released.
@@ -47,10 +45,10 @@ Self-attention has become a defacto choice for capturing global context in vario | Model | Top-1 accuracy | #params | GMACs | Latency | Ckpt | CoreML| |:---------------|:----:|:---:|:--:|:--:|:--:|:--:| -| SwiftFormer-XS | 75.7% | 3.5M | 0.4G | 0.7ms | [XS](https://drive.google.com/file/d/15Ils-U96pQePXQXx2MpmaI-yAceFAr2x/view?usp=sharing) | [XS](https://drive.google.com/file/d/1tZVxtbtAZoLLoDc5qqoUGulilksomLeK/view?usp=sharing) | -| SwiftFormer-S | 78.5% | 6.1M | 1.0G | 0.8ms | [S](https://drive.google.com/file/d/1_0eWwgsejtS0bWGBQS3gwAtYjXdPRGlu/view?usp=sharing) | [S](https://drive.google.com/file/d/13EOCZmtvbMR2V6UjezSZnbBz2_-59Fva/view?usp=sharing) | -| SwiftFormer-L1 | 80.9% | 12.1M | 1.6G | 1.1ms | [L1](https://drive.google.com/file/d/1jlwrwWQ0SQzDRc5adtWIwIut5d1g9EsM/view?usp=sharing) | [L1](https://drive.google.com/file/d/1c3VUsi4q7QQ2ykXVS2d4iCRL478fWF3e/view?usp=sharing) | -| SwiftFormer-L3 | 83.0% | 28.5M | 4.0G | 1.9ms | [L3](https://drive.google.com/file/d/1ypBcjx04ShmPYRhhjBRubiVjbExUgSa7/view?usp=sharing) | [L3](https://drive.google.com/file/d/1svahgIjh7da781jHOHjX58mtzCzYXSsJ/view?usp=sharing) | +| SwiftFormer-XS | 75.7% | 3.5M | 0.6G | 0.7ms | [XS](https://drive.google.com/file/d/12RchxzyiJrtZS-2Bur9k4wcRQMItA43S/view?usp=sharing) | [XS](https://drive.google.com/file/d/1bkAP_BD6CdDqlbQsStZhLa0ST2NZTIvH/view?usp=sharing) | +| SwiftFormer-S | 78.5% | 6.1M | 1.0G | 0.8ms | [S](https://drive.google.com/file/d/1awpcXAaHH38WaHrOmUM8updxQazUZ3Nb/view?usp=sharing) | [S](https://drive.google.com/file/d/1qNAhecWIeQ1YJotWhbnLTCR5Uv1zBaf1/view?usp=sharing) | +| SwiftFormer-L1 | 80.9% | 12.1M | 1.6G | 1.1ms | [L1](https://drive.google.com/file/d/1SDzauVmpR5uExkOv3ajxdwFnP-Buj9Uo/view?usp=sharing) | [L1](https://drive.google.com/file/d/1CowZE7-lbxz93uwXqefe-HxGOHUdvX_a/view?usp=sharing) | +| SwiftFormer-L3 | 83.0% | 28.5M | 4.0G | 1.9ms | [L3](https://drive.google.com/file/d/1DAxMe6FlnZBBIpR-HYIDfFLWJzIgiF0Y/view?usp=sharing) | 
[L3](https://drive.google.com/file/d/1SO3bRWd9oWJemy-gpYUcwP-B4bJ-dsdg/view?usp=sharing) | ## Detection and Segmentation Qualitative Results @@ -77,6 +75,7 @@ conda activate swiftformer pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 pip install timm +pip install coremltools==5.2.0 ``` ### Data preparation @@ -98,7 +97,7 @@ To train SwiftFormer models on an 8-GPU machine: sh dist_train.sh /path/to/imagenet 8 ``` -Note: specify which model command you want to run in the script. To reproduce the results of the paper, use 16-GPU machine with batch-size of 128 or 8-GPU machine with batch size of 256. Auto Augmentation, CutMix, MixUp are disabled for SwiftFormer-XS only. +Note: specify which model command you want to run in the script. To reproduce the results of the paper, use a 16-GPU machine with a per-GPU batch size of 128 or an 8-GPU machine with a per-GPU batch size of 256. Auto Augmentation, CutMix, and MixUp are disabled for SwiftFormer-XS, and CutMix and MixUp are disabled for SwiftFormer-S. 
### Multi-node training diff --git a/dist_train.sh b/dist_train.sh index 0f81d00..2578b11 100644 --- a/dist_train.sh +++ b/dist_train.sh @@ -4,18 +4,18 @@ IMAGENET_PATH=$1 nGPUs=$2 -## SwiftFormer-XS +## SwiftFormer-XS training python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_XS --aa="" --mixup 0 --cutmix 0 --data-path "$IMAGENET_PATH" \ --output_dir SwiftFormer_XS_results -## SwiftFormer-S -python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_S --data-path "$IMAGENET_PATH" \ +## SwiftFormer-S training +python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_S --mixup 0 --cutmix 0 --data-path "$IMAGENET_PATH" \ --output_dir SwiftFormer_S_results -## SwiftFormer-L1 +## SwiftFormer-L1 training python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_L1 --data-path "$IMAGENET_PATH" \ --output_dir SwiftFormer_L1_results -## SwiftFormer-L3 +## SwiftFormer-L3 training python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_L3 --data-path "$IMAGENET_PATH" \ --output_dir SwiftFormer_L3_results diff --git a/models/swiftformer.py b/models/swiftformer.py index 7243fae..1b74936 100644 --- a/models/swiftformer.py +++ b/models/swiftformer.py @@ -25,9 +25,6 @@ SwiftFormer_depth = { 'l3': [4, 4, 12, 6], } -CoreMLConversion = False - - def stem(in_chs, out_chs): """ Stem Layer that is implemented by two layers of conv. @@ -144,8 +141,8 @@ class Mlp(nn.Module): class EfficientAdditiveAttnetion(nn.Module): """ Efficient Additive Attention module for SwiftFormer. 
- Input: tensor in shape [B, C, H, W] - Output: tensor in shape [B, C, H, W] + Input: tensor in shape [B, N, D] + Output: tensor in shape [B, N, D] """ def __init__(self, in_dims=512, token_dim=256, num_heads=2): @@ -163,26 +160,23 @@ class EfficientAdditiveAttnetion(nn.Module): query = self.to_query(x) key = self.to_key(x) - if not CoreMLConversion: - # torch.nn.functional.normalize is not supported by the ANE of iPhone devices. - # Using this layer improves the accuracy by ~0.1-0.2% - query = torch.nn.functional.normalize(query, dim=-1) - key = torch.nn.functional.normalize(key, dim=-1) + query = torch.nn.functional.normalize(query, dim=-1) #BxNxD + key = torch.nn.functional.normalize(key, dim=-1) #BxNxD - query_weight = query @ self.w_g - A = query_weight * self.scale_factor + query_weight = query @ self.w_g # BxNx1 (BxNxD @ Dx1) + A = query_weight * self.scale_factor # BxNx1 - A = A.softmax(dim=-1) + A = torch.nn.functional.normalize(A, dim=1) # BxNx1 - G = torch.sum(A * query, dim=1) + G = torch.sum(A * query, dim=1) # BxD G = einops.repeat( G, "b d -> b repeat d", repeat=key.shape[1] - ) + ) # BxNxD - out = self.Proj(G * key) + query + out = self.Proj(G * key) + query #BxNxD - out = self.final(out) + out = self.final(out) # BxNxD return out @@ -505,3 +499,4 @@ def SwiftFormer_L3(pretrained=False, **kwargs): **kwargs) model.default_cfg = _cfg(crop_pct=0.9) return model + diff --git a/slurm_train.sh b/slurm_train.sh index 17c4650..85f485a 100644 --- a/slurm_train.sh +++ b/slurm_train.sh @@ -15,9 +15,8 @@ srun python main.py --model "$MODEL" \ --data-path "$IMAGENET_PATH" \ --batch-size 128 \ --epochs 300 \ ---aa="" --mixup 0 --cutmix 0 -## Note: Disable aa, mixup, and cutmix for SwiftFormer-XS only +## Note: Disable aa, mixup, and cutmix for SwiftFormer-XS, and disable mixup and cutmix for SwiftFormer-S. ## By default, this script requests total 16 GPUs on 4 nodes. The batch size per gpu is set to 128, -## tha sums to 128*16=2048 in total. 
\ No newline at end of file +## that sums to 128*16=2048 in total.