From 670dea3e1fa96f59ba47bb4d3334122bfc344f29 Mon Sep 17 00:00:00 2001
From: amshaker <abdelrahman.m.shaker@gmail.com>
Date: Wed, 26 Jul 2023 00:59:51 +0400
Subject: [PATCH 1/6] update README.md and models/swiftformer.py

---
 README.md             |  9 +++++----
 models/swiftformer.py | 31 ++++++++++++++-----------------
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 2eb9db8..529e3f5 100644
--- a/README.md
+++ b/README.md
@@ -47,10 +47,10 @@ Self-attention has become a defacto choice for capturing global context in vario
 
 | Model | Top-1 accuracy | #params | GMACs | Latency | Ckpt | CoreML|
 |:---------------|:----:|:---:|:--:|:--:|:--:|:--:|
-| SwiftFormer-XS |   75.7%    |     3.5M    |   0.4G   |      0.7ms     |  [XS](https://drive.google.com/file/d/15Ils-U96pQePXQXx2MpmaI-yAceFAr2x/view?usp=sharing)    |   [XS](https://drive.google.com/file/d/1tZVxtbtAZoLLoDc5qqoUGulilksomLeK/view?usp=sharing)    |
-| SwiftFormer-S  |   78.5%    |     6.1M    |   1.0G   |      0.8ms     |   [S](https://drive.google.com/file/d/1_0eWwgsejtS0bWGBQS3gwAtYjXdPRGlu/view?usp=sharing)   |   [S](https://drive.google.com/file/d/13EOCZmtvbMR2V6UjezSZnbBz2_-59Fva/view?usp=sharing)    |
-| SwiftFormer-L1 |   80.9%   |    12.1M   |   1.6G   |      1.1ms     |   [L1](https://drive.google.com/file/d/1jlwrwWQ0SQzDRc5adtWIwIut5d1g9EsM/view?usp=sharing)   |   [L1](https://drive.google.com/file/d/1c3VUsi4q7QQ2ykXVS2d4iCRL478fWF3e/view?usp=sharing)    |
-| SwiftFormer-L3 |   83.0%   |    28.5M    |   4.0G   |      1.9ms     |  [L3](https://drive.google.com/file/d/1ypBcjx04ShmPYRhhjBRubiVjbExUgSa7/view?usp=sharing)    |   [L3](https://drive.google.com/file/d/1svahgIjh7da781jHOHjX58mtzCzYXSsJ/view?usp=sharing)   |
+| SwiftFormer-XS |   75.7%    |     3.5M    |   0.6G   |      0.7ms     |  [XS](https://drive.google.com/file/d/12RchxzyiJrtZS-2Bur9k4wcRQMItA43S/view?usp=sharing)    |   [XS](https://drive.google.com/file/d/1bkAP_BD6CdDqlbQsStZhLa0ST2NZTIvH/view?usp=sharing)    |
+| SwiftFormer-S  |   78.5%    |     6.1M    |   1.0G   |      0.8ms     |   [S](https://drive.google.com/file/d/1awpcXAaHH38WaHrOmUM8updxQazUZ3Nb/view?usp=sharing)   |   [S](https://drive.google.com/file/d/1qNAhecWIeQ1YJotWhbnLTCR5Uv1zBaf1/view?usp=sharing)    |
+| SwiftFormer-L1 |   80.9%   |    12.1M   |   1.6G   |      1.1ms     |   [L1](https://drive.google.com/file/d/1SDzauVmpR5uExkOv3ajxdwFnP-Buj9Uo/view?usp=sharing)   |   [L1](https://drive.google.com/file/d/1CowZE7-lbxz93uwXqefe-HxGOHUdvX_a/view?usp=sharing)    |
+| SwiftFormer-L3 |   83.0%   |    28.5M    |   4.0G   |      1.9ms     |  [L3](https://drive.google.com/file/d/1DAxMe6FlnZBBIpR-HYIDfFLWJzIgiF0Y/view?usp=sharing)    |   [L3](https://drive.google.com/file/d/1SO3bRWd9oWJemy-gpYUcwP-B4bJ-dsdg/view?usp=sharing)   |
 
 
 ## Detection and Segmentation Qualitative Results
@@ -77,6 +77,7 @@ conda activate swiftformer
 
 pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
 pip install timm
+pip install coremltools==5.2.0
 ```
 
 ### Data preparation
diff --git a/models/swiftformer.py b/models/swiftformer.py
index 7243fae..40597ae 100644
--- a/models/swiftformer.py
+++ b/models/swiftformer.py
@@ -25,9 +25,6 @@ SwiftFormer_depth = {
     'l3': [4, 4, 12, 6],
 }
 
-CoreMLConversion = False
-
-
 def stem(in_chs, out_chs):
     """
     Stem Layer that is implemented by two layers of conv.
@@ -144,8 +141,8 @@ class Mlp(nn.Module):
 class EfficientAdditiveAttnetion(nn.Module):
     """
     Efficient Additive Attention module for SwiftFormer.
-    Input: tensor in shape [B, C, H, W]
-    Output: tensor in shape [B, C, H, W]
+    Input: tensor in shape [B, N, D]
+    Output: tensor in shape [B, N, D]
     """
 
     def __init__(self, in_dims=512, token_dim=256, num_heads=2):
@@ -163,26 +160,23 @@ class EfficientAdditiveAttnetion(nn.Module):
         query = self.to_query(x)
         key = self.to_key(x)
 
-        if not CoreMLConversion:
-            # torch.nn.functional.normalize is not supported by the ANE of iPhone devices.
-            # Using this layer improves the accuracy by ~0.1-0.2%
-            query = torch.nn.functional.normalize(query, dim=-1)
-            key = torch.nn.functional.normalize(key, dim=-1)
+        query = torch.nn.functional.normalize(query, dim=-1) #BxNxD
+        key = torch.nn.functional.normalize(key, dim=-1) #BxNxD
 
-        query_weight = query @ self.w_g
-        A = query_weight * self.scale_factor
+        query_weight = query @ self.w_g # BxNx1 (BxNxD @ Dx1)
+        A = query_weight * self.scale_factor # BxNx1
 
-        A = A.softmax(dim=-1)
+        A = torch.nn.functional.normalize(A, dim=1) # BxNx1
 
-        G = torch.sum(A * query, dim=1)
+        G = torch.sum(A * query, dim=1) # BxD
 
         G = einops.repeat(
             G, "b d -> b repeat d", repeat=key.shape[1]
-        )
+        ) # BxNxD
 
-        out = self.Proj(G * key) + query
+        out = self.Proj(G * key) + query #BxNxD
 
-        out = self.final(out)
+        out = self.final(out) # BxNxD
 
         return out
 
@@ -215,6 +209,7 @@ class SwiftFormerLocalRepresentation(nn.Module):
                 nn.init.constant_(m.bias, 0)
 
     def forward(self, x):
+        print("SwiftFormerLocalRepresentation input is ", x.shape)
         input = x
         x = self.dwconv(x)
         x = self.norm(x)
@@ -225,6 +220,7 @@ class SwiftFormerLocalRepresentation(nn.Module):
             x = input + self.drop_path(self.layer_scale * x)
         else:
             x = input + self.drop_path(x)
+        
         return x
 
 
@@ -505,3 +501,4 @@ def SwiftFormer_L3(pretrained=False, **kwargs):
         **kwargs)
     model.default_cfg = _cfg(crop_pct=0.9)
     return model
+

From ff08bf624df2093b227695bf31a99a9858d29045 Mon Sep 17 00:00:00 2001
From: amshaker <abdelrahman.m.shaker@gmail.com>
Date: Wed, 26 Jul 2023 01:05:02 +0400
Subject: [PATCH 2/6] update dist_train.sh

---
 dist_train.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist_train.sh b/dist_train.sh
index 0f81d00..f6cebbb 100644
--- a/dist_train.sh
+++ b/dist_train.sh
@@ -9,7 +9,7 @@ python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --m
 --output_dir SwiftFormer_XS_results
 
 ## SwiftFormer-S
-python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_S --data-path "$IMAGENET_PATH" \
+python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_S --mixup 0 --cutmix 0 --data-path "$IMAGENET_PATH" \
 --output_dir SwiftFormer_S_results
 
 ## SwiftFormer-L1

From adae6417b6f6b9e2c31da5ff0acf89f4ced7b8f8 Mon Sep 17 00:00:00 2001
From: amshaker <abdelrahman.m.shaker@gmail.com>
Date: Wed, 26 Jul 2023 01:11:47 +0400
Subject: [PATCH 3/6] update dist_train.sh

---
 dist_train.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dist_train.sh b/dist_train.sh
index f6cebbb..2578b11 100644
--- a/dist_train.sh
+++ b/dist_train.sh
@@ -4,18 +4,18 @@
 IMAGENET_PATH=$1
 nGPUs=$2
 
-## SwiftFormer-XS
+## SwiftFormer-XS training
 python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_XS --aa="" --mixup 0 --cutmix 0 --data-path "$IMAGENET_PATH" \
 --output_dir SwiftFormer_XS_results
 
-## SwiftFormer-S
+## SwiftFormer-S training
 python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_S --mixup 0 --cutmix 0 --data-path "$IMAGENET_PATH" \
 --output_dir SwiftFormer_S_results
 
-## SwiftFormer-L1
+## SwiftFormer-L1 training
 python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_L1 --data-path "$IMAGENET_PATH" \
 --output_dir SwiftFormer_L1_results
 
-## SwiftFormer-L3
+## SwiftFormer-L3 training
 python -m torch.distributed.launch --nproc_per_node=$nGPUs --use_env main.py --model SwiftFormer_L3 --data-path "$IMAGENET_PATH" \
 --output_dir SwiftFormer_L3_results

From 49bf3f55f0995377683221dda8c93df6543ab7f2 Mon Sep 17 00:00:00 2001
From: Abdelrahman Shaker <108531886+Amshaker@users.noreply.github.com>
Date: Wed, 26 Jul 2023 01:18:39 +0400
Subject: [PATCH 4/6] Update README.md

---
 README.md | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 529e3f5..90d83ce 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,17 @@
 # SwiftFormer
 ### **SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications**
  
-[Abdelrahman Shaker](https://scholar.google.com/citations?hl=en&user=eEz4Wu4AAAAJ),
-[Muhammad Maaz](https://scholar.google.com/citations?user=vTy9Te8AAAAJ&hl=en&authuser=1&oi=sra),
-[Hanoona Rasheed](https://scholar.google.com/citations?user=yhDdEuEAAAAJ&hl=en&authuser=1&oi=sra),
-[Salman Khan](https://salman-h-khan.github.io),
-[Ming-Hsuan Yang](https://scholar.google.com/citations?user=p9-ohHsAAAAJ&hl=en),
-and [Fahad Shahbaz Khan](https://scholar.google.es/citations?user=zvaeYnUAAAAJ&hl=en)
+![](https://i.imgur.com/waxVImv.png)
+[Abdelrahman Shaker](https://scholar.google.com/citations?hl=en&user=eEz4Wu4AAAAJ)<sup>*1</sup>, [Muhammad Maaz](https://scholar.google.com/citations?user=vTy9Te8AAAAJ&hl=en&authuser=1&oi=sra)<sup>1</sup>, [Hanoona Rasheed](https://scholar.google.com/citations?user=yhDdEuEAAAAJ&hl=en&authuser=1&oi=sra)<sup>1</sup>, [Salman Khan](https://salman-h-khan.github.io/)<sup>1</sup>, [Ming-Hsuan Yang](https://scholar.google.com/citations?user=p9-ohHsAAAAJ&hl=en)<sup>2,3</sup> and [Fahad Shahbaz Khan](https://scholar.google.es/citations?user=zvaeYnUAAAAJ&hl=en)<sup>1,4</sup>
 
+Mohamed Bin Zayed University of Artificial Intelligence<sup>1</sup>, University of California Merced<sup>2</sup>, Google Research<sup>3</sup>, Linköping University<sup>4</sup>
 <!-- [![Website](https://img.shields.io/badge/Project-Website-87CEEB)](site_url) -->
 [![paper](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2303.15446)
 <!-- [![video](https://img.shields.io/badge/Video-Presentation-F9D371)](youtube_link) -->
 <!-- [![slides](https://img.shields.io/badge/Presentation-Slides-B762C1)](presentation) -->
 
 ## :rocket: News
+* **(Jul 14, 2023):** SwiftFormer has been accepted at ICCV 2023. :fire::fire:
 * **(Mar 27, 2023):** Classification training and evaluation codes along with pre-trained models are released.
 
 <hr />
@@ -99,7 +97,7 @@ To train SwiftFormer models on an 8-GPU machine:
 sh dist_train.sh /path/to/imagenet 8
 ```
 
-Note: specify which model command you want to run in the script. To reproduce the results of the paper, use 16-GPU machine with batch-size of 128 or 8-GPU machine with batch size of 256. Auto Augmentation, CutMix, MixUp are disabled for SwiftFormer-XS only.
+Note: specify which model command you want to run in the script. To reproduce the results of the paper, use a 16-GPU machine with a batch size of 128 or an 8-GPU machine with a batch size of 256. Auto Augmentation, CutMix, and MixUp are disabled for SwiftFormer-XS, and CutMix and MixUp are disabled for SwiftFormer-S.
 
 ### Multi-node training
 

From ef5daec20c3b82a28b85dfbc1cc84d15fff3065d Mon Sep 17 00:00:00 2001
From: Abdelrahman Shaker <108531886+Amshaker@users.noreply.github.com>
Date: Wed, 26 Jul 2023 01:20:31 +0400
Subject: [PATCH 5/6] Update slurm_train.sh

---
 slurm_train.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/slurm_train.sh b/slurm_train.sh
index 17c4650..85f485a 100644
--- a/slurm_train.sh
+++ b/slurm_train.sh
@@ -15,9 +15,8 @@ srun python main.py --model "$MODEL" \
 --data-path "$IMAGENET_PATH" \
 --batch-size 128 \
 --epochs 300 \
---aa="" --mixup 0 --cutmix 0
 
 
-## Note: Disable aa, mixup, and cutmix for SwiftFormer-XS only
+## Note: Disable aa, mixup, and cutmix for SwiftFormer-XS (--aa="" --mixup 0 --cutmix 0), and disable mixup and cutmix for SwiftFormer-S (--mixup 0 --cutmix 0).
 ## By default, this script requests total 16 GPUs on 4 nodes. The batch size per gpu is set to 128,
-## tha sums to 128*16=2048 in total.
\ No newline at end of file
+## that sums to 128*16=2048 in total.

From 37a4fe953d943c69ed844b6181bd1be59984fe49 Mon Sep 17 00:00:00 2001
From: Abdelrahman Shaker <108531886+Amshaker@users.noreply.github.com>
Date: Wed, 26 Jul 2023 01:32:27 +0400
Subject: [PATCH 6/6] Update swiftformer.py

---
 models/swiftformer.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/models/swiftformer.py b/models/swiftformer.py
index 40597ae..1b74936 100644
--- a/models/swiftformer.py
+++ b/models/swiftformer.py
@@ -209,7 +209,6 @@ class SwiftFormerLocalRepresentation(nn.Module):
                 nn.init.constant_(m.bias, 0)
 
     def forward(self, x):
-        print("SwiftFormerLocalRepresentation input is ", x.shape)
         input = x
         x = self.dwconv(x)
         x = self.norm(x)
@@ -220,7 +219,6 @@ class SwiftFormerLocalRepresentation(nn.Module):
             x = input + self.drop_path(self.layer_scale * x)
         else:
             x = input + self.drop_path(x)
-        
         return x