From f0b8dac2b687743fa076771f3b61c1765404b50e Mon Sep 17 00:00:00 2001
From: JiahuiYu <JiahuiYu@users.noreply.github.com>
Date: Mon, 18 Mar 2019 16:42:06 -0500
Subject: [PATCH] V2.0.0 alpha (#4)

* Format with flake8

* Release pretrained model of usnets

* Create MODEL_ZOO.md

* Update README.md
---
 .gitignore                   |   1 +
 MODEL_ZOO.md                 |  19 +++++
 README.md                    |  34 ++++++---
 apps/us_mobilenet_v1_val.yml |  64 ++++++++++++++++
 apps/us_mobilenet_v2_val.yml |  64 ++++++++++++++++
 models/s_mobilenet_v1.py     |  26 ++-----
 models/s_mobilenet_v2.py     |  27 ++-----
 models/s_resnet.py           |   6 +-
 models/s_shufflenet.py       |  41 +++++-----
 models/slimmable_ops.py      | 127 +++++++++++++++++++++++++++++++
 models/us_mobilenet_v1.py    | 107 ++++++++++++++++++++++++++
 models/us_mobilenet_v2.py    | 141 +++++++++++++++++++++++++++++++++++
 train.py                     |  17 +++--
 utils/config.py              |   4 +-
 utils/model_profiling.py     |  11 +--
 utils/transforms.py          |   4 +-
 16 files changed, 601 insertions(+), 92 deletions(-)
 create mode 100644 MODEL_ZOO.md
 create mode 100644 apps/us_mobilenet_v1_val.yml
 create mode 100644 apps/us_mobilenet_v2_val.yml
 create mode 100644 models/us_mobilenet_v1.py
 create mode 100644 models/us_mobilenet_v2.py
diff --git a/.gitignore b/.gitignore
index f733534..ce82486 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 logs
 data
+.flake8
diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md
new file mode 100644
index 0000000..458e26e
--- /dev/null
+++ b/MODEL_ZOO.md
@@ -0,0 +1,19 @@
+# Slimmable Model Zoo
+
+## Slimmable Neural Networks ([ICLR 2019](https://arxiv.org/abs/1812.08928))
+
+
+| Model | Switches (Widths) | Top-1 Err. | MFLOPs | Model ID |
+| :--- | :---: | :---: | ---: | :---: |
+| S-MobileNet v1 | 1.00<br>0.75<br>0.50<br>0.25 | 28.5<br>30.5<br>35.2<br>46.9 | 569<br>325<br>150<br>41 | [a6285db](https://github.com/JiahuiYu/slimmable_networks/files/2709079/s_mobilenet_v1_0.25_0.5_0.75_1.0.pt.zip) |
+| S-MobileNet v2 | 1.00<br>0.75<br>0.50<br>0.35 | 29.5<br>31.1<br>35.6<br>40.3 | 301<br>209<br>97<br>59 | [0593ffd](https://github.com/JiahuiYu/slimmable_networks/files/2709080/s_mobilenet_v2_0.35_0.5_0.75_1.0.pt.zip) |
+| S-ShuffleNet | 2.00<br>1.00<br>0.50 | 28.6<br>34.5<br>42.8 | 524<br>138<br>38 | [1427f66](https://github.com/JiahuiYu/slimmable_networks/files/2709082/s_shufflenet_0.5_1.0_2.0.pt.zip) |
+| S-ResNet-50 | 1.00<br>0.75<br>0.50<br>0.25 | 24.0<br>25.1<br>27.9<br>35.0 | 4.1G<br>2.3G<br>1.1G<br>278 | [3fca9cc](https://drive.google.com/open?id=1f6q37OkZaz_0GoOAwllHlXNWuKwor2fC) |
+
+
+## Universally Slimmable Networks and Improved Training Techniques ([Preprint](https://arxiv.org/abs/1903.05134))
+
+| Model | Widths | Top-1 Err. | MFLOPs | Model ID |
+| :--- | :--- | :---: | ---: | :---: |
+| US-MobileNet v1 | 1.0<br> 0.975<br> 0.95<br> 0.925<br> 0.9<br> 0.875<br> 0.85<br> 0.825<br> 0.8<br> 0.775<br> 0.75<br> 0.725<br> 0.7<br> 0.675<br> 0.65<br> 0.625<br> 0.6<br> 0.575<br> 0.55<br> 0.525<br> 0.5<br> 0.475<br> 0.45<br> 0.425<br> 0.4<br> 0.375<br> 0.35<br> 0.325<br> 0.3<br> 0.275<br> 0.25 | 28.2<br> 28.3<br> 28.4<br> 28.7<br> 28.7<br> 29.1<br> 29.4<br> 29.7<br> 30.2<br> 30.3<br> 30.5<br> 30.9<br> 31.2<br> 31.7<br> 32.2<br> 32.5<br> 33.2<br> 33.7<br> 34.4<br> 35.0<br> 35.8<br> 36.5<br> 37.3<br> 38.1<br> 39.0<br> 40.0<br> 41.0<br> 41.9<br> 42.7<br> 44.2<br> 44.3 | 568<br> 543<br> 517<br> 490<br> 466<br> 443<br> 421<br> 389<br> 366<br> 345<br> 325<br> 306<br> 287<br> 267<br> 249<br> 232<br> 217<br> 201<br> 177<br> 162<br> 149<br> 136<br> 124<br> 114<br> 100<br> 89<br> 80<br> 71<br> 64<br> 48<br> 41 | [13d5af2](https://github.com/JiahuiYu/slimmable_networks/files/2979952/us_mobilenet_v1_calibrated.pt.zip) |
+| US-MobileNet v2 | 1.0<br> 0.975<br> 0.95<br> 0.925<br> 0.9<br> 0.875<br> 0.85<br> 0.825<br> 0.8<br> 0.775<br> 0.75<br> 0.725<br> 0.7<br> 0.675<br> 0.65<br> 0.625<br> 0.6<br> 0.575<br> 0.55<br> 0.525<br> 0.5<br> 0.475<br> 0.45<br> 0.425<br> 0.4<br> 0.375<br> 0.35 | 28.5<br> 28.5<br> 28.8<br> 28.9<br> 29.1<br> 29.1<br> 29.4<br> 29.9<br> 30.0<br> 30.2<br> 30.4<br> 30.7<br> 31.1<br> 31.4<br> 31.7<br> 31.7<br> 32.4<br> 32.4<br> 34.4<br> 34.6<br> 34.9<br> 35.1<br> 35.8<br> 35.8<br> 36.6<br> 36.7<br> 37.7 | 300<br> 299<br> 284<br> 274<br> 269<br> 268<br> 254<br> 235<br> 222<br> 213<br> 209<br> 185<br> 173<br> 165<br> 161<br> 161<br> 151<br> 150<br> 106<br> 100<br> 97<br> 96<br> 88<br> 88<br> 80<br> 80<br> 59 | [3880cad](https://github.com/JiahuiYu/slimmable_networks/files/2979953/us_mobilenet_v2_calibrated.pt.zip) |
diff --git a/README.md b/README.md
index b46364e..f14263b 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,25 @@
-# Slimmable Neural Networks
+# Slimmable Networks
 
-[ICLR 2019 Paper](https://arxiv.org/abs/1812.08928) | [ArXiv](https://arxiv.org/abs/1812.08928) | [OpenReview](https://openreview.net/forum?id=H1gMCsAqY7) | [Detection](https://github.com/JiahuiYu/slimmable_networks/tree/detection) | [Model Zoo](#model-zoo) | [BibTex](#citing)
+An open-source framework for slimmable training on ImageNet classification and COCO detection, which has enabled numerous projects.
+
+## [Slimmable Neural Networks](https://arxiv.org/abs/1812.08928)
+
+[ICLR 2019 Paper](https://arxiv.org/abs/1812.08928) | [OpenReview](https://openreview.net/forum?id=H1gMCsAqY7) | [Detection](https://github.com/JiahuiYu/slimmable_networks/tree/detection) | [Model Zoo](/MODEL_ZOO.md) | [BibTex](#citing)
 
 <img src="https://user-images.githubusercontent.com/22609465/50390872-1b3fb600-0702-11e9-8034-d0f41825d775.png" width=95%/>
 
 Illustration of slimmable neural networks. The same model can run at different widths (number of active channels), permitting instant and adaptive accuracy-efficiency trade-offs.
 
 
+## [Universally Slimmable Networks and Improved Training Techniques](https://arxiv.org/abs/1903.05134)
+
+[Preprint](https://arxiv.org/abs/1903.05134) | [Model Zoo](/MODEL_ZOO.md) | [BibTex](#citing)
+
+<img src="https://user-images.githubusercontent.com/22609465/54562571-45b5ae00-4995-11e9-8984-49e32d07e325.png" width=95%/>
+
+Illustration of universally slimmable networks. The same model can run at **arbitrary** widths.
+
+
 ## Run
 
 0. Requirements:
@@ -22,16 +35,6 @@ Illustration of slimmable neural networks. The same model can run at different w
     * If you still have questions, please search closed issues first. If the problem is not solved, please open a new.
 
 
-## Model Zoo
-
-| Model | Switches (Widths) | Top-1 Err. | MFLOPs | Model ID |
-| :--- | :---: | :---: | ---: | :---: |
-| S-MobileNet v1 | 1.00<br>0.75<br>0.50<br>0.25 | 28.5<br>30.5<br>35.2<br>46.9 | 569<br>325<br>150<br>41 | [a6285db](https://github.com/JiahuiYu/slimmable_networks/files/2709079/s_mobilenet_v1_0.25_0.5_0.75_1.0.pt.zip) |
-| S-MobileNet v2 | 1.00<br>0.75<br>0.50<br>0.35 | 29.5<br>31.1<br>35.6<br>40.3 | 301<br>209<br>97<br>59 | [0593ffd](https://github.com/JiahuiYu/slimmable_networks/files/2709080/s_mobilenet_v2_0.35_0.5_0.75_1.0.pt.zip) |
-| S-ShuffleNet | 2.00<br>1.00<br>0.50 | 28.6<br>34.5<br>42.8 | 524<br>138<br>38 | [1427f66](https://github.com/JiahuiYu/slimmable_networks/files/2709082/s_shufflenet_0.5_1.0_2.0.pt.zip) |
-| S-ResNet-50 | 1.00<br>0.75<br>0.50<br>0.25 | 24.0<br>25.1<br>27.9<br>35.0 | 4.1G<br>2.3G<br>1.1G<br>278 | [3fca9cc](https://drive.google.com/open?id=1f6q37OkZaz_0GoOAwllHlXNWuKwor2fC) |
-
-
 ## Technical Details
 
 Implementing slimmable networks and slimmable training is straightforward:
@@ -54,4 +57,11 @@ The software is for educaitonal and academic research purpose only.
   journal={arXiv preprint arXiv:1812.08928},
   year={2018}
 }
+
+@article{yu2019universally,
+  title={Universally Slimmable Networks and Improved Training Techniques},
+  author={Yu, Jiahui and Huang, Thomas},
+  journal={arXiv preprint arXiv:1903.05134},
+  year={2019}
+}
 ```
diff --git a/apps/us_mobilenet_v1_val.yml b/apps/us_mobilenet_v1_val.yml
new file mode 100644
index 0000000..eef1a60
--- /dev/null
+++ b/apps/us_mobilenet_v1_val.yml
@@ -0,0 +1,64 @@
+# =========================== Basic Settings ===========================
+# machine info
+num_gpus_per_job: 4  # number of gpus each job need
+num_cpus_per_job: 63  # number of cpus each job need
+memory_per_job: 380  # memory requirement each job need
+gpu_type: "nvidia-tesla-p100"
+
+# data
+dataset: imagenet1k
+data_transforms: imagenet1k_basic
+data_loader: imagenet1k_basic
+dataset_dir: data/imagenet
+data_loader_workers: 62
+
+# info
+num_classes: 1000
+image_size: 224
+topk: [1, 5]
+num_epochs: 100
+
+# optimizer
+optimizer: sgd
+momentum: 0.9
+weight_decay: 0.0001
+nesterov: True
+
+# lr
+lr: 0.1
+lr_scheduler: multistep
+multistep_lr_milestones: [30, 60, 90]
+multistep_lr_gamma: 0.1
+
+# model profiling
+profiling: [gpu]
+
+# pretrain, resume, test_only
+pretrained: ''
+resume: ''
+test_only: False
+
+#
+random_seed: 1995
+batch_size: 256
+model: ''
+reset_parameters: True
+
+
+# =========================== Override Settings ===========================
+log_dir: logs/
+slimmable_training: True
+model: models.us_mobilenet_v1
+width_mult: 1.0
+width_mult_list: [0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75, 0.775, 0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95, 0.975, 1.0]
+width_mult_range: [0.25, 1.0]
+data_transforms: imagenet1k_mobile
+# num_gpus_per_job:
+# lr:
+# lr_scheduler:
+# exp_decaying_lr_gamma:
+# num_epochs:
+# batch_size:
+# test pretrained
+test_only: True
+pretrained: logs/us_mobilenet_v1_calibrated.pt
diff --git a/apps/us_mobilenet_v2_val.yml b/apps/us_mobilenet_v2_val.yml
new file mode 100644
index 0000000..73c5f95
--- /dev/null
+++ b/apps/us_mobilenet_v2_val.yml
@@ -0,0 +1,64 @@
+# =========================== Basic Settings ===========================
+# machine info
+num_gpus_per_job: 4  # number of gpus each job need
+num_cpus_per_job: 63  # number of cpus each job need
+memory_per_job: 380  # memory requirement each job need
+gpu_type: "nvidia-tesla-p100"
+
+# data
+dataset: imagenet1k
+data_transforms: imagenet1k_basic
+data_loader: imagenet1k_basic
+dataset_dir: data/imagenet
+data_loader_workers: 62
+
+# info
+num_classes: 1000
+image_size: 224
+topk: [1, 5]
+num_epochs: 100
+
+# optimizer
+optimizer: sgd
+momentum: 0.9
+weight_decay: 0.0001
+nesterov: True
+
+# lr
+lr: 0.1
+lr_scheduler: multistep
+multistep_lr_milestones: [30, 60, 90]
+multistep_lr_gamma: 0.1
+
+# model profiling
+profiling: [gpu]
+
+# pretrain, resume, test_only
+pretrained: ''
+resume: ''
+test_only: False
+
+#
+random_seed: 1995
+batch_size: 256
+model: ''
+reset_parameters: True
+
+
+# =========================== Override Settings ===========================
+log_dir: logs/
+slimmable_training: True
+model: models.us_mobilenet_v2
+width_mult: 1.0
+width_mult_list: [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75, 0.775, 0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95, 0.975, 1.0]
+width_mult_range: [0.35, 1.0]
+data_transforms: imagenet1k_mobile
+# num_gpus_per_job:
+# lr:
+# lr_scheduler:
+# exp_decaying_lr_gamma:
+# num_epochs:
+# batch_size:
+# test pretrained
+test_only: True
+pretrained: logs/us_mobilenet_v2_calibrated.pt
diff --git a/models/s_mobilenet_v1.py b/models/s_mobilenet_v1.py
index da90b74..65e2dd0 100644
--- a/models/s_mobilenet_v1.py
+++ b/models/s_mobilenet_v1.py
@@ -3,26 +3,10 @@
 
 
 from .slimmable_ops import SwitchableBatchNorm2d
-from .slimmable_ops import SlimmableConv2d, SlimmableLinear
+from .slimmable_ops import SlimmableConv2d, SlimmableLinear, make_divisible
 from utils.config import FLAGS
 
 
-def _make_divisible(v, divisor=8, min_value=8):
-    """
-    forked from slim:
-    https://github.com/tensorflow/models/blob/\
-    0344c5503ee55e24f0de7f37336a6e08f10976fd/\
-    research/slim/nets/mobilenet/mobilenet.py#L62-L69
-    """
-    if min_value is None:
-        min_value = divisor
-    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
-    # Make sure that round down does not go down by more than 10%.
-    if new_v < 0.9 * v:
-        new_v += divisor
-    return new_v
-
-
 class DepthwiseSeparableConv(nn.Module):
     def __init__(self, inp, outp, stride):
         super(DepthwiseSeparableConv, self).__init__()
@@ -63,10 +47,10 @@ def __init__(self, num_classes=1000, input_size=224):
         # head
         assert input_size % 32 == 0
         channels = [
-            _make_divisible(32 * width_mult)
+            make_divisible(32 * width_mult)
             for width_mult in FLAGS.width_mult_list]
         self.outp = [
-            _make_divisible(1024 * width_mult)
+            make_divisible(1024 * width_mult)
             for width_mult in FLAGS.width_mult_list]
         first_stride = 2
         self.features.append(
@@ -81,7 +65,7 @@ def __init__(self, num_classes=1000, input_size=224):
         # body
         for c, n, s in self.block_setting:
             outp = [
-                _make_divisible(c * width_mult)
+                make_divisible(c * width_mult)
                 for width_mult in FLAGS.width_mult_list]
             for i in range(n):
                 if i == 0:
@@ -92,7 +76,7 @@ def __init__(self, num_classes=1000, input_size=224):
                         DepthwiseSeparableConv(channels, outp, 1))
                 channels = outp
 
-        avg_pool_size = input_size//32
+        avg_pool_size = input_size // 32
         self.features.append(nn.AvgPool2d(avg_pool_size))
 
         # make it nn.Sequential
diff --git a/models/s_mobilenet_v2.py b/models/s_mobilenet_v2.py
index 4ffd4a1..936b661 100644
--- a/models/s_mobilenet_v2.py
+++ b/models/s_mobilenet_v2.py
@@ -3,25 +3,10 @@
 
 
 from .slimmable_ops import SwitchableBatchNorm2d, SlimmableConv2d
+from .slimmable_ops import make_divisible
 from utils.config import FLAGS
 
 
-def _make_divisible(v, divisor=8, min_value=1):
-    """
-    forked from slim:
-    https://github.com/tensorflow/models/blob/\
-    0344c5503ee55e24f0de7f37336a6e08f10976fd/\
-    research/slim/nets/mobilenet/mobilenet.py#L62-L69
-    """
-    if min_value is None:
-        min_value = divisor
-    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
-    # Make sure that round down does not go down by more than 10%.
-    if new_v < 0.9 * v:
-        new_v += divisor
-    return new_v
-
-
 class InvertedResidual(nn.Module):
     def __init__(self, inp, outp, stride, expand_ratio):
         super(InvertedResidual, self).__init__()
@@ -31,7 +16,7 @@ def __init__(self, inp, outp, stride, expand_ratio):
 
         layers = []
         # expand
-        expand_inp = [i*expand_ratio for i in inp]
+        expand_inp = [i * expand_ratio for i in inp]
         if expand_ratio != 1:
             layers += [
                 SlimmableConv2d(inp, expand_inp, 1, 1, 0, bias=False),
@@ -80,9 +65,9 @@ def __init__(self, num_classes=1000, input_size=224):
         # head
         assert input_size % 32 == 0
         channels = [
-            _make_divisible(32 * width_mult)
+            make_divisible(32 * width_mult)
             for width_mult in FLAGS.width_mult_list]
-        self.outp = _make_divisible(
+        self.outp = make_divisible(
             1280 * max(FLAGS.width_mult_list)) if max(
                 FLAGS.width_mult_list) > 1.0 else 1280
         first_stride = 2
@@ -98,7 +83,7 @@ def __init__(self, num_classes=1000, input_size=224):
         # body
         for t, c, n, s in self.block_setting:
             outp = [
-                _make_divisible(c * width_mult)
+                make_divisible(c * width_mult)
                 for width_mult in FLAGS.width_mult_list]
             for i in range(n):
                 if i == 0:
@@ -120,7 +105,7 @@ def __init__(self, num_classes=1000, input_size=224):
                 nn.ReLU6(inplace=True),
             )
         )
-        avg_pool_size = input_size//32
+        avg_pool_size = input_size // 32
         self.features.append(nn.AvgPool2d(avg_pool_size))
 
         # make it nn.Sequential
diff --git a/models/s_resnet.py b/models/s_resnet.py
index e530d0e..1ebce3e 100644
--- a/models/s_resnet.py
+++ b/models/s_resnet.py
@@ -12,7 +12,7 @@ def __init__(self, inp, outp, stride):
         super(Block, self).__init__()
         assert stride in [1, 2]
 
-        midp = [i//4 for i in outp]
+        midp = [i // 4 for i in outp]
         layers = [
             SlimmableConv2d(inp, midp, 1, 1, 0, bias=False),
             SwitchableBatchNorm2d(midp),
@@ -79,7 +79,7 @@ def __init__(self, num_classes=1000, input_size=224):
         # body
         for stage_id, n in enumerate(self.block_setting):
             outp = [
-                int(feats[stage_id]*width_mult*4)
+                int(feats[stage_id] * width_mult * 4)
                 for width_mult in FLAGS.width_mult_list]
             for i in range(n):
                 if i == 0 and stage_id != 0:
@@ -88,7 +88,7 @@ def __init__(self, num_classes=1000, input_size=224):
                     self.features.append(Block(channels, outp, 1))
                 channels = outp
 
-        avg_pool_size = input_size//32
+        avg_pool_size = input_size // 32
         self.features.append(nn.AvgPool2d(avg_pool_size))
 
         # make it nn.Sequential
diff --git a/models/s_shufflenet.py b/models/s_shufflenet.py
index 4cc3f48..6f2251b 100644
--- a/models/s_shufflenet.py
+++ b/models/s_shufflenet.py
@@ -16,7 +16,7 @@ def __init__(self, groups):
 
     def forward(self, x):
         b, n, h, w = x.size()
-        x = x.view(b, self.groups, n//self.groups, h, w)
+        x = x.view(b, self.groups, n // self.groups, h, w)
         x = torch.transpose(x, 1, 2).contiguous()
         x = x.view(b, -1, h, w)
         return x
@@ -41,10 +41,10 @@ def __init__(self, inp, outp, stride):
         else:
             self.first_group = FLAGS.groups
 
-        inp_split = [i//self.first_group for i in inp]
-        midp = [i//FLAGS.width_compress for i in outp]
-        lastp = [i//FLAGS.groups for i in block_outp]
-        firstp = [i//FLAGS.groups for i in midp]
+        inp_split = [i // self.first_group for i in inp]
+        midp = [i // FLAGS.width_compress for i in outp]
+        lastp = [i // FLAGS.groups for i in block_outp]
+        firstp = [i // FLAGS.groups for i in midp]
         self.firstp = firstp
         self.inp = inp
         self.midp = midp
@@ -75,8 +75,8 @@ def __init__(self, inp, outp, stride):
             SwitchableBatchNorm2d(midp),
             # nn.ReLU(inplace=True),
         ]
-        midp_split = [i//FLAGS.groups for i in midp]
-        lastp = [i//FLAGS.groups for i in block_outp]
+        midp_split = [i // FLAGS.groups for i in midp]
+        lastp = [i // FLAGS.groups for i in block_outp]
         layers_c = [
             nn.Sequential(
                 SlimmableConv2d(midp_split, lastp, 1, 1, 0, bias=False),
@@ -98,12 +98,14 @@ def __init__(self, inp, outp, stride):
     def forward(self, x):
         if self.residual_connection:
             res = x
-            x_split = torch.split(res, list(res.size())[1]//self.a_len, dim=1)
+            x_split = torch.split(
+                res, list(res.size())[1] // self.a_len, dim=1)
             res = torch.cat(
                 [getattr(self, 'a_{}'.format(i))(x_split[i]) for i in range(
                     self.a_len)], 1)
             res = self.b(res)
-            x_split = torch.split(res, list(res.size())[1]//self.c_len, dim=1)
+            x_split = torch.split(
+                res, list(res.size())[1] // self.c_len, dim=1)
             res = torch.cat(
                 [getattr(self, 'c_{}'.format(i))(x_split[i]) for i in range(
                     self.c_len)], 1)
@@ -118,12 +120,13 @@ def forward(self, x):
                         self.a_len)], 1)
             else:
                 x_split = torch.split(
-                    res, list(res.size())[1]//self.a_len, dim=1)
+                    res, list(res.size())[1] // self.a_len, dim=1)
                 res = torch.cat(
                     [getattr(self, 'a_{}'.format(i))(
                         x_split[i]) for i in range(self.a_len)], 1)
             res = self.b(res)
-            x_split = torch.split(res, list(res.size())[1]//self.c_len, dim=1)
+            x_split = torch.split(
+                res, list(res.size())[1] // self.c_len, dim=1)
             res = torch.cat(
                 [getattr(self, 'c_{}'.format(i))(x_split[i]) for i in range(
                     self.c_len)], 1)
@@ -161,9 +164,10 @@ def __init__(self, num_classes=1000, input_size=224):
 
         self.features = []
 
-        channels = [int(24*width_mult) for width_mult in FLAGS.width_mult_list]
+        channels = [
+            int(24 * width_mult) for width_mult in FLAGS.width_mult_list]
         first_stride = 2
-        group_channels = [i//FLAGS.groups for i in channels]
+        group_channels = [i // FLAGS.groups for i in channels]
         head = [
             nn.Sequential(
                 SlimmableConv2d(
@@ -179,11 +183,12 @@ def __init__(self, num_classes=1000, input_size=224):
             setattr(self, 'head_{}'.format(i), head[i])
 
         for c, s in self.block_setting:
-            outp = [int(c*width_mult) for width_mult in FLAGS.width_mult_list]
+            outp = [
+                int(c * width_mult) for width_mult in FLAGS.width_mult_list]
             self.features.append(Block(channels, outp, s))
             channels = outp
 
-        avg_pool_size = input_size//32
+        avg_pool_size = input_size // 32
         self.features.append(nn.AvgPool2d(avg_pool_size))
 
         # make it nn.Sequential
@@ -191,9 +196,9 @@ def __init__(self, num_classes=1000, input_size=224):
 
         # classifier
         self.classifier = nn.Sequential(
-          SlimmableLinear(
-            channels,
-            [num_classes for _ in range(len(channels))])
+            SlimmableLinear(
+                channels,
+                [num_classes for _ in range(len(channels))])
         )
         if FLAGS.reset_parameters:
             self.reset_parameters()
diff --git a/models/slimmable_ops.py b/models/slimmable_ops.py
index 2cccc9b..0fa43b7 100644
--- a/models/slimmable_ops.py
+++ b/models/slimmable_ops.py
@@ -71,3 +71,158 @@ def forward(self, input):
         else:
             bias = self.bias
         return nn.functional.linear(input, weight, bias)
+
+
+def make_divisible(v, divisor=8, min_value=1):
+    """Round ``v`` to the nearest multiple of ``divisor``.
+
+    forked from slim:
+    https://github.com/tensorflow/models/blob/\
+    0344c5503ee55e24f0de7f37336a6e08f10976fd/\
+    research/slim/nets/mobilenet/mobilenet.py#L62-L69
+
+    The result is clamped below by ``min_value`` (``divisor`` when
+    ``min_value`` is None) and bumped up by one ``divisor`` when the
+    rounding would otherwise lose more than 10% of ``v``.
+    """
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class USConv2d(nn.Conv2d):
+    """Conv2d whose active channels are sliced according to ``width_mult``.
+
+    ``us`` flags whether the (input, output) channel counts scale with the
+    width; ``ratio`` holds the (input, output) multiples the scaled counts
+    are kept divisible by. ``width_mult`` must be assigned before the
+    first ``forward`` call.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, depthwise=False, bias=True,
+                 us=(True, True), ratio=(1, 1)):
+        super(USConv2d, self).__init__(
+            in_channels, out_channels,
+            kernel_size, stride=stride, padding=padding, dilation=dilation,
+            groups=groups, bias=bias)
+        self.depthwise = depthwise
+        self.in_channels_max = in_channels
+        self.out_channels_max = out_channels
+        self.width_mult = None
+        self.us = us
+        self.ratio = ratio
+
+    def forward(self, input):
+        if self.us[0]:
+            self.in_channels = make_divisible(
+                self.in_channels_max
+                * self.width_mult
+                / self.ratio[0]) * self.ratio[0]
+        if self.us[1]:
+            self.out_channels = make_divisible(
+                self.out_channels_max
+                * self.width_mult
+                / self.ratio[1]) * self.ratio[1]
+        self.groups = self.in_channels if self.depthwise else 1
+        weight = self.weight[:self.out_channels, :self.in_channels, :, :]
+        if self.bias is not None:
+            bias = self.bias[:self.out_channels]
+        else:
+            bias = self.bias
+        y = nn.functional.conv2d(
+            input, weight, bias, self.stride, self.padding,
+            self.dilation, self.groups)
+        if getattr(FLAGS, 'conv_averaged', False):
+            # USConv2d keeps no per-width channel list; the full width is
+            # in_channels_max.
+            y = y * (self.in_channels_max / self.in_channels)
+        return y
+
+
+class USLinear(nn.Linear):
+    """Linear layer whose active features are sliced by ``width_mult``.
+
+    ``us`` flags whether the (input, output) feature counts scale with the
+    width. ``width_mult`` must be assigned before the first ``forward``.
+    """
+
+    def __init__(self, in_features, out_features, bias=True, us=(True, True)):
+        super(USLinear, self).__init__(
+            in_features, out_features, bias=bias)
+        self.in_features_max = in_features
+        self.out_features_max = out_features
+        self.width_mult = None
+        self.us = us
+
+    def forward(self, input):
+        if self.us[0]:
+            self.in_features = make_divisible(
+                self.in_features_max * self.width_mult)
+        if self.us[1]:
+            self.out_features = make_divisible(
+                self.out_features_max * self.width_mult)
+        weight = self.weight[:self.out_features, :self.in_features]
+        if self.bias is not None:
+            bias = self.bias[:self.out_features]
+        else:
+            bias = self.bias
+        return nn.functional.linear(input, weight, bias)
+
+
+class USBatchNorm2d(nn.BatchNorm2d):
+    """BatchNorm2d with shared affine parameters sliced by ``width_mult``.
+
+    Widths listed in ``FLAGS.width_mult_list`` use the running statistics
+    of a dedicated ``nn.BatchNorm2d`` in ``self.bn``; any other width
+    passes this module's own ``running_mean``/``running_var``.
+    """
+
+    def __init__(self, num_features, ratio=1):
+        super(USBatchNorm2d, self).__init__(
+            num_features, affine=True, track_running_stats=False)
+        self.num_features_max = num_features
+        # for tracking performance during training
+        self.bn = nn.ModuleList(
+            [nn.BatchNorm2d(i, affine=False)
+             for i in [
+                     make_divisible(
+                         self.num_features_max * width_mult / ratio) * ratio
+                     for width_mult in FLAGS.width_mult_list]
+             ]
+        )
+        self.ratio = ratio
+        self.width_mult = None
+        self.ignore_model_profiling = True
+
+    def forward(self, input):
+        weight = self.weight
+        bias = self.bias
+        c = make_divisible(
+            self.num_features_max * self.width_mult / self.ratio) * self.ratio
+        if self.width_mult in FLAGS.width_mult_list:
+            idx = FLAGS.width_mult_list.index(self.width_mult)
+            y = nn.functional.batch_norm(
+                input,
+                self.bn[idx].running_mean[:c],
+                self.bn[idx].running_var[:c],
+                weight[:c],
+                bias[:c],
+                self.training,
+                self.momentum,
+                self.eps)
+        else:
+            y = nn.functional.batch_norm(
+                input,
+                self.running_mean,
+                self.running_var,
+                weight[:c],
+                bias[:c],
+                self.training,
+                self.momentum,
+                self.eps)
+        return y
diff --git a/models/us_mobilenet_v1.py b/models/us_mobilenet_v1.py
new file mode 100644
index 0000000..ba24274
--- /dev/null
+++ b/models/us_mobilenet_v1.py
@@ -0,0 +1,115 @@
+import math
+import torch.nn as nn
+
+
+from .slimmable_ops import USBatchNorm2d, USConv2d, USLinear, make_divisible
+from utils.config import FLAGS
+
+
+class DepthwiseSeparableConv(nn.Module):
+    """3x3 depthwise conv followed by a 1x1 pointwise conv, each followed
+    by ``USBatchNorm2d`` and ReLU6."""
+
+    def __init__(self, inp, outp, stride):
+        super(DepthwiseSeparableConv, self).__init__()
+        assert stride in [1, 2]
+
+        layers = [
+            USConv2d(
+                inp, inp, 3, stride, 1, groups=inp, depthwise=True,
+                bias=False),
+            USBatchNorm2d(inp),
+            nn.ReLU6(inplace=True),
+
+            USConv2d(inp, outp, 1, 1, 0, bias=False),
+            USBatchNorm2d(outp),
+            nn.ReLU6(inplace=True),
+        ]
+        self.body = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.body(x)
+
+
+class Model(nn.Module):
+    """Universally slimmable MobileNet v1, constructed at the width given
+    by ``FLAGS.width_mult_range[-1]``."""
+
+    def __init__(self, num_classes=1000, input_size=224):
+        super(Model, self).__init__()
+
+        # setting of depthwise separable blocks
+        self.block_setting = [
+            # c, n, s
+            [64, 1, 1],
+            [128, 2, 2],
+            [256, 2, 2],
+            [512, 6, 2],
+            [1024, 2, 2],
+        ]
+
+        self.features = []
+
+        width_mult = FLAGS.width_mult_range[-1]
+        # head
+        assert input_size % 32 == 0
+        channels = make_divisible(32 * width_mult)
+        self.outp = make_divisible(1024 * width_mult)
+        first_stride = 2
+        self.features.append(
+            nn.Sequential(
+                USConv2d(
+                    3, channels, 3, first_stride, 1, bias=False,
+                    us=[False, True]),
+                USBatchNorm2d(channels),
+                nn.ReLU6(inplace=True))
+        )
+
+        # body
+        for c, n, s in self.block_setting:
+            outp = make_divisible(c * width_mult)
+            for i in range(n):
+                if i == 0:
+                    self.features.append(
+                        DepthwiseSeparableConv(channels, outp, s))
+                else:
+                    self.features.append(
+                        DepthwiseSeparableConv(channels, outp, 1))
+                channels = outp
+
+        avg_pool_size = input_size // 32
+        self.features.append(nn.AvgPool2d(avg_pool_size))
+
+        # make it nn.Sequential
+        self.features = nn.Sequential(*self.features)
+
+        # classifier
+        self.classifier = nn.Sequential(
+            USLinear(self.outp, num_classes, us=[True, False])
+        )
+        if FLAGS.reset_parameters:
+            self.reset_parameters()
+
+    def forward(self, x):
+        x = self.features(x)
+        last_dim = x.size()[1]
+        x = x.view(-1, last_dim)
+        x = self.classifier(x)
+        return x
+
+    def reset_parameters(self):
+        """Re-initialize conv, batch-norm and linear parameters in place."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+                if m.bias is not None:
+                    m.bias.data.zero_()
+            elif isinstance(m, nn.BatchNorm2d):
+                if m.affine:
+                    m.weight.data.fill_(1)
+                    m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                m.weight.data.normal_(0, 0.01)
+                if m.bias is not None:
+                    m.bias.data.zero_()
diff --git a/models/us_mobilenet_v2.py b/models/us_mobilenet_v2.py
new file mode 100644
index 0000000..03ef058
--- /dev/null
+++ b/models/us_mobilenet_v2.py
@@ -0,0 +1,141 @@
+import math
+import torch.nn as nn
+
+
+from .slimmable_ops import USBatchNorm2d, USConv2d, USLinear, make_divisible
+from utils.config import FLAGS
+
+
+class InvertedResidual(nn.Module):
+    """MobileNetV2-style block: optional expand, depthwise, then project.
+
+    ``expand_ratio`` scales the hidden channel count; the expand stage is
+    skipped when it equals 1. The input is added to the output only when
+    ``stride == 1`` and ``inp == outp``.
+    """
+
+    def __init__(self, inp, outp, stride, expand_ratio):
+        super(InvertedResidual, self).__init__()
+        assert stride in [1, 2]
+        # skip connection only for stride 1 with unchanged channel count
+        self.residual_connection = stride == 1 and inp == outp
+        t, hidden = expand_ratio, inp * expand_ratio
+        stages = []
+        if t != 1:
+            # expand
+            stages.extend([
+                USConv2d(inp, hidden, 1, 1, 0, bias=False, ratio=[1, t]),
+                USBatchNorm2d(hidden, ratio=t),
+                nn.ReLU6(inplace=True),
+            ])
+        stages.extend([
+            # depthwise
+            USConv2d(
+                hidden, hidden, 3, stride, 1, groups=hidden,
+                depthwise=True, bias=False, ratio=[t, t]),
+            USBatchNorm2d(hidden, ratio=t),
+            nn.ReLU6(inplace=True),
+            # project back (no activation follows)
+            USConv2d(hidden, outp, 1, 1, 0, bias=False, ratio=[t, 1]),
+            USBatchNorm2d(outp),
+        ])
+        self.body = nn.Sequential(*stages)
+
+    def forward(self, x):
+        """Apply the body, adding ``x`` back when the skip is enabled."""
+        out = self.body(x)
+        if self.residual_connection:
+            out += x
+        return out
+
+
+class Model(nn.Module):
+    """MobileNetV2-style classifier built from US* ops, sized by FLAGS."""
+    def __init__(self, num_classes=1000, input_size=224):
+        super(Model, self).__init__()
+
+        # setting of inverted residual blocks
+        self.block_setting = [
+            # t, c, n, s
+            [1, 16, 1, 1],
+            [6, 24, 2, 2],
+            [6, 32, 3, 2],
+            [6, 64, 4, 2],
+            [6, 96, 3, 1],
+            [6, 160, 3, 2],
+            [6, 320, 1, 1],
+        ]
+        if FLAGS.dataset == 'cifar10':
+            # NOTE(review): index 2 replaces the [6, 32, 3, 2] row, not the
+            # [6, 24, 2, 2] row -- confirm this is the intended stage.
+            self.block_setting[2] = [6, 24, 2, 1]
+
+        self.features = []
+
+        width_mult = FLAGS.width_mult_range[-1]
+        # head
+        assert input_size % 32 == 0
+        channels = make_divisible(32 * width_mult)
+        self.outp = make_divisible(
+            1280 * width_mult) if width_mult > 1.0 else 1280
+        first_stride = 2
+        self.features.append(
+            nn.Sequential(
+                USConv2d(
+                    3, channels, 3, first_stride, 1, bias=False,
+                    us=[False, True]),
+                USBatchNorm2d(channels),
+                nn.ReLU6(inplace=True))
+        )
+
+        # body
+        for t, c, n, s in self.block_setting:
+            outp = make_divisible(c * width_mult)
+            for i in range(n):
+                self.features.append(
+                    InvertedResidual(channels, outp, s if i == 0 else 1, t))
+                channels = outp
+
+        # tail
+        self.features.append(
+            nn.Sequential(
+                USConv2d(
+                    channels, self.outp, 1, 1, 0, bias=False,
+                    us=[True, False]),
+                nn.BatchNorm2d(self.outp),
+                nn.ReLU6(inplace=True),
+            )
+        )
+        avg_pool_size = input_size // 32
+        self.features.append(nn.AvgPool2d(avg_pool_size))
+
+        # make it nn.Sequential
+        self.features = nn.Sequential(*self.features)
+
+        # classifier
+        self.classifier = nn.Sequential(nn.Linear(self.outp, num_classes))
+        if FLAGS.reset_parameters:
+            self.reset_parameters()
+
+    def forward(self, x):
+        """Run the feature stack, flatten to (-1, outp) and classify."""
+        x = self.features(x)
+        x = x.view(-1, self.outp)
+        x = self.classifier(x)
+        return x
+
+    def reset_parameters(self):
+        """Re-initialise conv, affine batch-norm and linear parameters."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+                if m.bias is not None:
+                    m.bias.data.zero_()
+            elif isinstance(m, nn.BatchNorm2d) and m.affine:
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                m.weight.data.normal_(0, 0.01)
+                if m.bias is not None:
+                    m.bias.data.zero_()
diff --git a/train.py b/train.py
index c217b2c..10f575c 100644
--- a/train.py
+++ b/train.py
@@ -132,15 +132,15 @@ def get_lr_scheduler(optimizer):
             if i == 0:
                 lr_dict[i] = 1
             else:
-                lr_dict[i] = lr_dict[i-1] * FLAGS.exp_decaying_lr_gamma
-        lr_lambda = lambda epoch: lr_dict[epoch]  # noqa: E731
+                lr_dict[i] = lr_dict[i - 1] * FLAGS.exp_decaying_lr_gamma
+        lr_lambda = lambda epoch: lr_dict[epoch]
         lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
             optimizer, lr_lambda=lr_lambda)
     elif FLAGS.lr_scheduler == 'linear_decaying':
         lr_dict = {}
         for i in range(FLAGS.num_epochs):
             lr_dict[i] = 1. - i / FLAGS.num_epochs
-        lr_lambda = lambda epoch: lr_dict[epoch]  # noqa: E731
+        lr_lambda = lambda epoch: lr_dict[epoch]
         lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
             optimizer, lr_lambda=lr_lambda)
     else:
@@ -248,7 +248,7 @@ def forward_loss(model, criterion, input, target, meter):
     correct = pred.eq(target.view(1, -1).expand_as(pred))
     for k in FLAGS.topk:
         correct_k = correct[:k].float().sum(0)
-        error_list = list(1.-correct_k.cpu().detach().numpy())
+        error_list = list(1. - correct_k.cpu().detach().numpy())
         meter['top{}_error'.format(k)].cache_list(error_list)
     return loss
 
@@ -271,7 +271,8 @@ def run_one_epoch(
         other_widths.remove(min_width)
     if train and FLAGS.lr_scheduler == 'linear_decaying':
         linear_decaying_per_step = (
-            FLAGS.lr/FLAGS.num_epochs/len(loader.dataset)*FLAGS.batch_size)
+            FLAGS.lr / FLAGS.num_epochs /
+            len(loader.dataset) * FLAGS.batch_size)
     for batch_idx, (input, target) in enumerate(loader):
         target = target.cuda(non_blocking=True)
         if train:
@@ -311,8 +312,8 @@ def run_one_epoch(
                                               for k, v in results.items()))
     else:
         results = flush_scalar_meters(meters)
-        print('{:.1f}s\t{}\t{}/{}: '.format(
-            time.time() - t_start, phase, epoch, FLAGS.num_epochs) +
+        print('{:.1f}s\t{}\t{}/{}: '.format(time.time() - t_start,
+                                            phase, epoch, FLAGS.num_epochs) +
               ', '.join('{}: {:.3f}'.format(k, v) for k, v in results.items()))
     return results
 
@@ -391,7 +392,7 @@ def train_val_test():
         return
 
     print('Start training.')
-    for epoch in range(last_epoch+1, FLAGS.num_epochs):
+    for epoch in range(last_epoch + 1, FLAGS.num_epochs):
         lr_scheduler.step()
         # train
         results = run_one_epoch(
diff --git a/utils/config.py b/utils/config.py
index 1ad7e0b..51c5c8e 100644
--- a/utils/config.py
+++ b/utils/config.py
@@ -89,7 +89,7 @@ def __repr__(self):
                 ret_str.append('{}:'.format(key))
                 child_ret_str = value.__repr__().split('\n')
                 for item in child_ret_str:
-                    ret_str.append('    '+item)
+                    ret_str.append('    ' + item)
             elif isinstance(value, list):
                 if isinstance(value[0], AttrDict):
                     ret_str.append('{}:'.format(key))
@@ -97,7 +97,7 @@ def __repr__(self):
                         # treat as AttrDict above
                         child_ret_str = item.__repr__().split('\n')
                         for item in child_ret_str:
-                            ret_str.append('    '+item)
+                            ret_str.append('    ' + item)
                 else:
                     ret_str.append('{}: {}'.format(key, value))
             else:
diff --git a/utils/model_profiling.py b/utils/model_profiling.py
index 50b4d6a..361ace3 100644
--- a/utils/model_profiling.py
+++ b/utils/model_profiling.py
@@ -131,9 +131,8 @@ def module_profiling(self, input, output, verbose):
 def add_profiling_hooks(m, verbose):
     global model_profiling_hooks
     model_profiling_hooks.append(
-      m.register_forward_hook(
-        lambda m, input, output: module_profiling(
-          m, input, output, verbose=verbose)))
+        m.register_forward_hook(lambda m, input, output: module_profiling(
+            m, input, output, verbose=verbose)))
 
 
 def remove_profiling_hooks():
@@ -174,10 +173,12 @@ def model_profiling(model, height, width, batch=1, channel=3, use_cuda=True,
         'macs'.rjust(macs_space, ' ') +
         'nanosecs'.rjust(seconds_space, ' '))
     if verbose:
-        print(''.center(name_space+params_space+macs_space+seconds_space, '-'))
+        print(''.center(
+            name_space + params_space + macs_space + seconds_space, '-'))
     model(data)
     if verbose:
-        print(''.center(name_space+params_space+macs_space+seconds_space, '-'))
+        print(''.center(
+            name_space + params_space + macs_space + seconds_space, '-'))
     print(
         'Total'.ljust(name_space, ' ') +
         '{:,}'.format(model.n_params).rjust(params_space, ' ') +
diff --git a/utils/transforms.py b/utils/transforms.py
index c8c15e4..72fa4e8 100644
--- a/utils/transforms.py
+++ b/utils/transforms.py
@@ -8,9 +8,9 @@
 imagenet_pca = {
     'eigval': np.asarray([0.2175, 0.0188, 0.0045]),
     'eigvec': np.asarray([
-        [-0.5675,  0.7192,  0.4009],
+        [-0.5675, 0.7192, 0.4009],
         [-0.5808, -0.0045, -0.8140],
-        [-0.5836, -0.6948,  0.4203],
+        [-0.5836, -0.6948, 0.4203],
     ])
 }