From f0b8dac2b687743fa076771f3b61c1765404b50e Mon Sep 17 00:00:00 2001 From: JiahuiYu Date: Mon, 18 Mar 2019 16:42:06 -0500 Subject: [PATCH] V2.0.0 alpha (#4) * Format with flake8 * Release pretrained model of usnets * Create MODEL_ZOO.md * Update README.md --- .gitignore | 1 + MODEL_ZOO.md | 19 +++++ README.md | 34 ++++++--- apps/us_mobilenet_v1_val.yml | 64 ++++++++++++++++ apps/us_mobilenet_v2_val.yml | 64 ++++++++++++++++ models/s_mobilenet_v1.py | 26 ++----- models/s_mobilenet_v2.py | 27 ++----- models/s_resnet.py | 6 +- models/s_shufflenet.py | 41 +++++----- models/slimmable_ops.py | 127 +++++++++++++++++++++++++++++++ models/us_mobilenet_v1.py | 107 ++++++++++++++++++++++++++ models/us_mobilenet_v2.py | 141 +++++++++++++++++++++++++++++++++++ train.py | 17 +++-- utils/config.py | 4 +- utils/model_profiling.py | 11 +-- utils/transforms.py | 4 +- 16 files changed, 601 insertions(+), 92 deletions(-) create mode 100644 MODEL_ZOO.md create mode 100644 apps/us_mobilenet_v1_val.yml create mode 100644 apps/us_mobilenet_v2_val.yml create mode 100644 models/us_mobilenet_v1.py create mode 100644 models/us_mobilenet_v2.py diff --git a/.gitignore b/.gitignore index f733534..ce82486 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ logs data +.flake8 diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md new file mode 100644 index 0000000..458e26e --- /dev/null +++ b/MODEL_ZOO.md @@ -0,0 +1,19 @@ +# Slimmable Model Zoo + +## Slimmable Neural Networks ([ICLR 2019](https://arxiv.org/abs/1812.08928)) + + +| Model | Switches (Widths) | Top-1 Err. | MFLOPs | Model ID | +| :--- | :---: | :---: | ---: | :---: | +| S-MobileNet v1 | 1.00
0.75
0.50
0.25 | 28.5
30.5
35.2
46.9 | 569
325
150
41 | [a6285db](https://github.com/JiahuiYu/slimmable_networks/files/2709079/s_mobilenet_v1_0.25_0.5_0.75_1.0.pt.zip) | +| S-MobileNet v2 | 1.00
0.75
0.50
0.35 | 29.5
31.1
35.6
40.3 | 301
209
97
59 | [0593ffd](https://github.com/JiahuiYu/slimmable_networks/files/2709080/s_mobilenet_v2_0.35_0.5_0.75_1.0.pt.zip) | +| S-ShuffleNet | 2.00
1.00
0.50 | 28.6
34.5
42.8 | 524
138
38 | [1427f66](https://github.com/JiahuiYu/slimmable_networks/files/2709082/s_shufflenet_0.5_1.0_2.0.pt.zip) | +| S-ResNet-50 | 1.00
0.75
0.50
0.25 | 24.0
25.1
27.9
35.0 | 4.1G
2.3G
1.1G
278 | [3fca9cc](https://drive.google.com/open?id=1f6q37OkZaz_0GoOAwllHlXNWuKwor2fC) | + + +## Universally Slimmable Networks and Improved Training Techniques ([Preprint](https://arxiv.org/abs/1903.05134)) + +| Model | Widths | Top-1 Err. | MFLOPs | Model ID | +| :--- | :--- | :---: | ---: | :---: | +| US-MobileNet v1 | 1.0
0.975
0.95
0.925
0.9
0.875
0.85
0.825
0.8
0.775
0.75
0.725
0.7
0.675
0.65
0.625
0.6
0.575
0.55
0.525
0.5
0.475
0.45
0.425
0.4
0.375
0.35
0.325
0.3
0.275
0.25 | 28.2
28.3
28.4
28.7
28.7
29.1
29.4
29.7
30.2
30.3
30.5
30.9
31.2
31.7
32.2
32.5
33.2
33.7
34.4
35.0
35.8
36.5
37.3
38.1
39.0
40.0
41.0
41.9
42.7
44.2
44.3 | 568
543
517
490
466
443
421
389
366
345
325
306
287
267
249
232
217
201
177
162
149
136
124
114
100
89
80
71
64
48
41 | [13d5af2](https://github.com/JiahuiYu/slimmable_networks/files/2979952/us_mobilenet_v1_calibrated.pt.zip) | +| US-MobileNet v2 | 1.0
0.975
0.95
0.925
0.9
0.875
0.85
0.825
0.8
0.775
0.75
0.725
0.7
0.675
0.65
0.625
0.6
0.575
0.55
0.525
0.5
0.475
0.45
0.425
0.4
0.375
0.35 | 28.5
28.5
28.8
28.9
29.1
29.1
29.4
29.9
30.0
30.2
30.4
30.7
31.1
31.4
31.7
31.7
32.4
32.4
34.4
34.6
34.9
35.1
35.8
35.8
36.6
36.7
37.7
| 300
299
284
274
269
268
254
235
222
213
209
185
173
165
161
161
151
150
106
100
97
96
88
88
80
80
59 | [3880cad](https://github.com/JiahuiYu/slimmable_networks/files/2979953/us_mobilenet_v2_calibrated.pt.zip) | diff --git a/README.md b/README.md index b46364e..f14263b 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,25 @@ -# Slimmable Neural Networks +# Slimmable Networks -[ICLR 2019 Paper](https://arxiv.org/abs/1812.08928) | [ArXiv](https://arxiv.org/abs/1812.08928) | [OpenReview](https://openreview.net/forum?id=H1gMCsAqY7) | [Detection](https://github.com/JiahuiYu/slimmable_networks/tree/detection) | [Model Zoo](#model-zoo) | [BibTex](#citing) +An open-source framework for slimmable training on ImageNet classification and COCO detection, which has enabled numerous projects. + +## [Slimmable Neural Networks](https://arxiv.org/abs/1812.08928) + +[ICLR 2019 Paper](https://arxiv.org/abs/1812.08928) | [OpenReview](https://openreview.net/forum?id=H1gMCsAqY7) | [Detection](https://github.com/JiahuiYu/slimmable_networks/tree/detection) | [Model Zoo](/MODEL_ZOO.md) | [BibTex](#citing) Illustration of slimmable neural networks. The same model can run at different widths (number of active channels), permitting instant and adaptive accuracy-efficiency trade-offs. +## [Universally Slimmable Networks and Improved Training Techniques](https://arxiv.org/abs/1903.05134) + +[Preprint](https://arxiv.org/abs/1903.05134) | [Model Zoo](/MODEL_ZOO.md) | [BibTex](#citing) + + + +Illustration of universally slimmable networks. The same model can run at **arbitrary** widths. + + ## Run 0. Requirements: @@ -22,16 +35,6 @@ Illustration of slimmable neural networks. The same model can run at different w * If you still have questions, please search closed issues first. If the problem is not solved, please open a new. -## Model Zoo - -| Model | Switches (Widths) | Top-1 Err. | MFLOPs | Model ID | -| :--- | :---: | :---: | ---: | :---: | -| S-MobileNet v1 | 1.00
0.75
0.50
0.25 | 28.5
30.5
35.2
46.9 | 569
325
150
41 | [a6285db](https://github.com/JiahuiYu/slimmable_networks/files/2709079/s_mobilenet_v1_0.25_0.5_0.75_1.0.pt.zip) | -| S-MobileNet v2 | 1.00
0.75
0.50
0.35 | 29.5
31.1
35.6
40.3 | 301
209
97
59 | [0593ffd](https://github.com/JiahuiYu/slimmable_networks/files/2709080/s_mobilenet_v2_0.35_0.5_0.75_1.0.pt.zip) | -| S-ShuffleNet | 2.00
1.00
0.50 | 28.6
34.5
42.8 | 524
138
38 | [1427f66](https://github.com/JiahuiYu/slimmable_networks/files/2709082/s_shufflenet_0.5_1.0_2.0.pt.zip) | -| S-ResNet-50 | 1.00
0.75
0.50
0.25 | 24.0
25.1
27.9
35.0 | 4.1G
2.3G
1.1G
278 | [3fca9cc](https://drive.google.com/open?id=1f6q37OkZaz_0GoOAwllHlXNWuKwor2fC) | - - ## Technical Details Implementing slimmable networks and slimmable training is straightforward: @@ -54,4 +57,11 @@ The software is for educaitonal and academic research purpose only. journal={arXiv preprint arXiv:1812.08928}, year={2018} } + +@article{yu2019universally, + title={Universally Slimmable Networks and Improved Training Techniques}, + author={Yu, Jiahui and Huang, Thomas}, + journal={arXiv preprint arXiv:1903.05134}, + year={2019} +} ``` diff --git a/apps/us_mobilenet_v1_val.yml b/apps/us_mobilenet_v1_val.yml new file mode 100644 index 0000000..eef1a60 --- /dev/null +++ b/apps/us_mobilenet_v1_val.yml @@ -0,0 +1,64 @@ +# =========================== Basic Settings =========================== +# machine info +num_gpus_per_job: 4 # number of gpus each job need +num_cpus_per_job: 63 # number of cpus each job need +memory_per_job: 380 # memory requirement each job need +gpu_type: "nvidia-tesla-p100" + +# data +dataset: imagenet1k +data_transforms: imagenet1k_basic +data_loader: imagenet1k_basic +dataset_dir: data/imagenet +data_loader_workers: 62 + +# info +num_classes: 1000 +image_size: 224 +topk: [1, 5] +num_epochs: 100 + +# optimizer +optimizer: sgd +momentum: 0.9 +weight_decay: 0.0001 +nesterov: True + +# lr +lr: 0.1 +lr_scheduler: multistep +multistep_lr_milestones: [30, 60, 90] +multistep_lr_gamma: 0.1 + +# model profiling +profiling: [gpu] + +# pretrain, resume, test_only +pretrained: '' +resume: '' +test_only: False + +# +random_seed: 1995 +batch_size: 256 +model: '' +reset_parameters: True + + +# =========================== Override Settings =========================== +log_dir: logs/ +slimmable_training: True +model: models.us_mobilenet_v1 +width_mult: 1.0 +width_mult_list: [0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75, 0.775, 0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95, 0.975, 1.0] +width_mult_range: [0.25, 1.0] +data_transforms: imagenet1k_mobile +# num_gpus_per_job: +# lr: +# lr_scheduler: +# exp_decaying_lr_gamma: +# num_epochs: +# batch_size: +# test pretrained +test_only: True +pretrained: logs/us_mobilenet_v1_calibrated.pt diff --git a/apps/us_mobilenet_v2_val.yml b/apps/us_mobilenet_v2_val.yml new file mode 100644 index 0000000..73c5f95 --- /dev/null +++ b/apps/us_mobilenet_v2_val.yml @@ -0,0 +1,64 @@ +# =========================== Basic Settings =========================== +# machine info +num_gpus_per_job: 4 # number of gpus each job need +num_cpus_per_job: 63 # number of cpus each job need +memory_per_job: 380 # memory requirement each job need +gpu_type: "nvidia-tesla-p100" + +# data +dataset: imagenet1k +data_transforms: imagenet1k_basic +data_loader: imagenet1k_basic +dataset_dir: data/imagenet +data_loader_workers: 62 + +# info +num_classes: 1000 +image_size: 224 +topk: [1, 5] +num_epochs: 100 + +# optimizer +optimizer: sgd +momentum: 0.9 +weight_decay: 0.0001 +nesterov: True + +# lr +lr: 0.1 +lr_scheduler: multistep +multistep_lr_milestones: [30, 60, 90] +multistep_lr_gamma: 0.1 + +# model profiling +profiling: [gpu] + +# pretrain, resume, test_only +pretrained: '' +resume: '' +test_only: False + +# +random_seed: 1995 +batch_size: 256 +model: '' +reset_parameters: True + + +# =========================== Override Settings =========================== +log_dir: logs/ +slimmable_training: True +model: models.us_mobilenet_v2 +width_mult: 1.0 +width_mult_list: [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75, 0.775, 0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95, 0.975, 1.0] +width_mult_range: [0.35, 1.0] +data_transforms: imagenet1k_mobile +# num_gpus_per_job: +# lr: +# lr_scheduler: +# exp_decaying_lr_gamma: +# num_epochs: +# batch_size: +# test pretrained +test_only: True +pretrained: logs/us_mobilenet_v2_calibrated.pt diff --git a/models/s_mobilenet_v1.py b/models/s_mobilenet_v1.py index da90b74..65e2dd0 100644 --- a/models/s_mobilenet_v1.py +++ b/models/s_mobilenet_v1.py @@ -3,26 +3,10 @@ from .slimmable_ops import SwitchableBatchNorm2d -from .slimmable_ops import SlimmableConv2d, SlimmableLinear +from .slimmable_ops import SlimmableConv2d, SlimmableLinear, make_divisible from utils.config import FLAGS -def _make_divisible(v, divisor=8, min_value=8): - """ - forked from slim: - https://github.com/tensorflow/models/blob/\ - 0344c5503ee55e24f0de7f37336a6e08f10976fd/\ - research/slim/nets/mobilenet/mobilenet.py#L62-L69 - """ - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - class DepthwiseSeparableConv(nn.Module): def __init__(self, inp, outp, stride): super(DepthwiseSeparableConv, self).__init__() @@ -63,10 +47,10 @@ def __init__(self, num_classes=1000, input_size=224): # head assert input_size % 32 == 0 channels = [ - _make_divisible(32 * width_mult) + make_divisible(32 * width_mult) for width_mult in FLAGS.width_mult_list] self.outp = [ - _make_divisible(1024 * width_mult) + make_divisible(1024 * width_mult) for width_mult in FLAGS.width_mult_list] first_stride = 2 self.features.append( @@ -81,7 +65,7 @@ def __init__(self, num_classes=1000, input_size=224): # body for c, n, s in self.block_setting: outp = [ - _make_divisible(c * width_mult) + make_divisible(c * width_mult) for width_mult in FLAGS.width_mult_list] for i in range(n): if i == 0: @@ -92,7 +76,7 @@ def __init__(self, num_classes=1000, input_size=224): DepthwiseSeparableConv(channels, outp, 1)) channels = outp - avg_pool_size = input_size//32 + avg_pool_size = input_size // 32 self.features.append(nn.AvgPool2d(avg_pool_size)) # make it nn.Sequential diff --git a/models/s_mobilenet_v2.py b/models/s_mobilenet_v2.py index 4ffd4a1..936b661 100644 --- a/models/s_mobilenet_v2.py +++ b/models/s_mobilenet_v2.py @@ -3,25 +3,10 @@ from .slimmable_ops import SwitchableBatchNorm2d, SlimmableConv2d +from .slimmable_ops import make_divisible from utils.config import FLAGS -def _make_divisible(v, divisor=8, min_value=1): - """ - forked from slim: - https://github.com/tensorflow/models/blob/\ - 0344c5503ee55e24f0de7f37336a6e08f10976fd/\ - research/slim/nets/mobilenet/mobilenet.py#L62-L69 - """ - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - class InvertedResidual(nn.Module): def __init__(self, inp, outp, stride, expand_ratio): super(InvertedResidual, self).__init__() @@ -31,7 +16,7 @@ def __init__(self, inp, outp, stride, expand_ratio): layers = [] # expand - expand_inp = [i*expand_ratio for i in inp] + expand_inp = [i * expand_ratio for i in inp] if expand_ratio != 1: layers += [ SlimmableConv2d(inp, expand_inp, 1, 1, 0, bias=False), @@ -80,9 +65,9 @@ def __init__(self, num_classes=1000, input_size=224): # head assert input_size % 32 == 0 channels = [ - _make_divisible(32 * width_mult) + make_divisible(32 * width_mult) for width_mult in FLAGS.width_mult_list] - self.outp = _make_divisible( + self.outp = make_divisible( 1280 * max(FLAGS.width_mult_list)) if max( FLAGS.width_mult_list) > 1.0 else 1280 first_stride = 2 @@ -98,7 +83,7 @@ def __init__(self, num_classes=1000, input_size=224): # body for t, c, n, s in self.block_setting: outp = [ - _make_divisible(c * width_mult) + make_divisible(c * width_mult) for width_mult in FLAGS.width_mult_list] for i in range(n): if i == 0: @@ -120,7 +105,7 @@ def __init__(self, num_classes=1000, input_size=224): nn.ReLU6(inplace=True), ) ) - avg_pool_size = input_size//32 + avg_pool_size = input_size // 32 self.features.append(nn.AvgPool2d(avg_pool_size)) # make it nn.Sequential diff --git a/models/s_resnet.py b/models/s_resnet.py index e530d0e..1ebce3e 100644 --- a/models/s_resnet.py +++ b/models/s_resnet.py @@ -12,7 +12,7 @@ def __init__(self, inp, outp, stride): super(Block, self).__init__() assert stride in [1, 2] - midp = [i//4 for i in outp] + midp = [i // 4 for i in outp] layers = [ SlimmableConv2d(inp, midp, 1, 1, 0, bias=False), SwitchableBatchNorm2d(midp), @@ -79,7 +79,7 @@ def __init__(self, num_classes=1000, input_size=224): # body for stage_id, n in enumerate(self.block_setting): outp = [ - int(feats[stage_id]*width_mult*4) + int(feats[stage_id] * width_mult * 4) for width_mult in FLAGS.width_mult_list] for i in range(n): if i == 0 and stage_id != 0: @@ -88,7 +88,7 @@ def __init__(self, num_classes=1000, input_size=224): self.features.append(Block(channels, outp, 1)) channels = outp - avg_pool_size = input_size//32 + avg_pool_size = input_size // 32 self.features.append(nn.AvgPool2d(avg_pool_size)) # make it nn.Sequential diff --git a/models/s_shufflenet.py b/models/s_shufflenet.py index 4cc3f48..6f2251b 100644 --- a/models/s_shufflenet.py +++ b/models/s_shufflenet.py @@ -16,7 +16,7 @@ def __init__(self, groups): def forward(self, x): b, n, h, w = x.size() - x = x.view(b, self.groups, n//self.groups, h, w) + x = x.view(b, self.groups, n // self.groups, h, w) x = torch.transpose(x, 1, 2).contiguous() x = x.view(b, -1, h, w) return x @@ -41,10 +41,10 @@ def __init__(self, inp, outp, stride): else: self.first_group = FLAGS.groups - inp_split = [i//self.first_group for i in inp] - midp = [i//FLAGS.width_compress for i in outp] - lastp = [i//FLAGS.groups for i in block_outp] - firstp = [i//FLAGS.groups for i in midp] + inp_split = [i // self.first_group for i in inp] + midp = [i // FLAGS.width_compress for i in outp] + lastp = [i // FLAGS.groups for i in block_outp] + firstp = [i // FLAGS.groups for i in midp] self.firstp = firstp self.inp = inp self.midp = midp @@ -75,8 +75,8 @@ def __init__(self, inp, outp, stride): SwitchableBatchNorm2d(midp), # nn.ReLU(inplace=True), ] - midp_split = [i//FLAGS.groups for i in midp] - lastp = [i//FLAGS.groups for i in block_outp] + midp_split = [i // FLAGS.groups for i in midp] + lastp = [i // FLAGS.groups for i in block_outp] layers_c = [ nn.Sequential( SlimmableConv2d(midp_split, lastp, 1, 1, 0, bias=False), @@ -98,12 +98,14 @@ def __init__(self, inp, outp, stride): def forward(self, x): if self.residual_connection: res = x - x_split = torch.split(res, list(res.size())[1]//self.a_len, dim=1) + x_split = torch.split( + res, list(res.size())[1] // self.a_len, dim=1) res = torch.cat( [getattr(self, 'a_{}'.format(i))(x_split[i]) for i in range( self.a_len)], 1) res = self.b(res) - x_split = torch.split(res, list(res.size())[1]//self.c_len, dim=1) + x_split = torch.split( + res, list(res.size())[1] // self.c_len, dim=1) res = torch.cat( [getattr(self, 'c_{}'.format(i))(x_split[i]) for i in range( self.c_len)], 1) @@ -118,12 +120,13 @@ def forward(self, x): self.a_len)], 1) else: x_split = torch.split( - res, list(res.size())[1]//self.a_len, dim=1) + res, list(res.size())[1] // self.a_len, dim=1) res = torch.cat( [getattr(self, 'a_{}'.format(i))( x_split[i]) for i in range(self.a_len)], 1) res = self.b(res) - x_split = torch.split(res, list(res.size())[1]//self.c_len, dim=1) + x_split = torch.split( + res, list(res.size())[1] // self.c_len, dim=1) res = torch.cat( [getattr(self, 'c_{}'.format(i))(x_split[i]) for i in range( self.c_len)], 1) @@ -161,9 +164,10 @@ def __init__(self, num_classes=1000, input_size=224): self.features = [] - channels = [int(24*width_mult) for width_mult in FLAGS.width_mult_list] + channels = [ + int(24 * width_mult) for width_mult in FLAGS.width_mult_list] first_stride = 2 - group_channels = [i//FLAGS.groups for i in channels] + group_channels = [i // FLAGS.groups for i in channels] head = [ nn.Sequential( SlimmableConv2d( @@ -179,11 +183,12 @@ def __init__(self, num_classes=1000, input_size=224): setattr(self, 'head_{}'.format(i), head[i]) for c, s in self.block_setting: - outp = [int(c*width_mult) for width_mult in FLAGS.width_mult_list] + outp = [ + int(c * width_mult) for width_mult in FLAGS.width_mult_list] self.features.append(Block(channels, outp, s)) channels = outp - avg_pool_size = input_size//32 + avg_pool_size = input_size // 32 self.features.append(nn.AvgPool2d(avg_pool_size)) # make it nn.Sequential @@ -191,9 +196,9 @@ def __init__(self, num_classes=1000, input_size=224): # classifier self.classifier = nn.Sequential( - SlimmableLinear( - channels, - [num_classes for _ in range(len(channels))]) + SlimmableLinear( + channels, + [num_classes for _ in range(len(channels))]) ) if FLAGS.reset_parameters: self.reset_parameters() diff --git a/models/slimmable_ops.py b/models/slimmable_ops.py index 2cccc9b..0fa43b7 100644 --- a/models/slimmable_ops.py +++ b/models/slimmable_ops.py @@ -71,3 +71,130 @@ def forward(self, input): else: bias = self.bias return nn.functional.linear(input, weight, bias) + + +def make_divisible(v, divisor=8, min_value=1): + """ + forked from slim: + https://github.com/tensorflow/models/blob/\ + 0344c5503ee55e24f0de7f37336a6e08f10976fd/\ + research/slim/nets/mobilenet/mobilenet.py#L62-L69 + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class USConv2d(nn.Conv2d): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, dilation=1, groups=1, depthwise=False, bias=True, + us=[True, True], ratio=[1, 1]): + super(USConv2d, self).__init__( + in_channels, out_channels, + kernel_size, stride=stride, padding=padding, dilation=dilation, + groups=groups, bias=bias) + self.depthwise = depthwise + self.in_channels_max = in_channels + self.out_channels_max = out_channels + self.width_mult = None + self.us = us + self.ratio = ratio + + def forward(self, input): + if self.us[0]: + self.in_channels = make_divisible( + self.in_channels_max + * self.width_mult + / self.ratio[0]) * self.ratio[0] + if self.us[1]: + self.out_channels = make_divisible( + self.out_channels_max + * self.width_mult + / self.ratio[1]) * self.ratio[1] + self.groups = self.in_channels if self.depthwise else 1 + weight = self.weight[:self.out_channels, :self.in_channels, :, :] + if self.bias is not None: + bias = self.bias[:self.out_channels] + else: + bias = self.bias + y = nn.functional.conv2d( + input, weight, bias, self.stride, self.padding, + self.dilation, self.groups) + if getattr(FLAGS, 'conv_averaged', False): + y = y * (max(self.in_channels_list)/self.in_channels) + return y + + +class USLinear(nn.Linear): + def __init__(self, in_features, out_features, bias=True, us=[True, True]): + super(USLinear, self).__init__( + in_features, out_features, bias=bias) + self.in_features_max = in_features + self.out_features_max = out_features + self.width_mult = None + self.us = us + + def forward(self, input): + if self.us[0]: + self.in_features = make_divisible( + self.in_features_max * self.width_mult) + if self.us[1]: + self.out_features = make_divisible( + self.out_features_max * self.width_mult) + weight = self.weight[:self.out_features, :self.in_features] + if self.bias is not None: + bias = self.bias[:self.out_features] + else: + bias = self.bias + return nn.functional.linear(input, weight, bias) + + +class USBatchNorm2d(nn.BatchNorm2d): + def __init__(self, num_features, ratio=1): + super(USBatchNorm2d, self).__init__( + num_features, affine=True, track_running_stats=False) + self.num_features_max = num_features + # for tracking performance during training + self.bn = nn.ModuleList( + [nn.BatchNorm2d(i, affine=False) + for i in [ + make_divisible( + self.num_features_max * width_mult / ratio) * ratio + for width_mult in FLAGS.width_mult_list] + ] + ) + self.ratio = ratio + self.width_mult = None + self.ignore_model_profiling = True + + def forward(self, input): + weight = self.weight + bias = self.bias + c = make_divisible( + self.num_features_max * self.width_mult / self.ratio) * self.ratio + if self.width_mult in FLAGS.width_mult_list: + idx = FLAGS.width_mult_list.index(self.width_mult) + y = nn.functional.batch_norm( + input, + self.bn[idx].running_mean[:c], + self.bn[idx].running_var[:c], + weight[:c], + bias[:c], + self.training, + self.momentum, + self.eps) + else: + y = nn.functional.batch_norm( + input, + self.running_mean, + self.running_var, + weight[:c], + bias[:c], + self.training, + self.momentum, + self.eps) + return y diff --git a/models/us_mobilenet_v1.py b/models/us_mobilenet_v1.py new file mode 100644 index 0000000..ba24274 --- /dev/null +++ b/models/us_mobilenet_v1.py @@ -0,0 +1,107 @@ +import math +import torch.nn as nn + + +from .slimmable_ops import USBatchNorm2d, USConv2d, USLinear, make_divisible +from utils.config import FLAGS + + +class DepthwiseSeparableConv(nn.Module): + def __init__(self, inp, outp, stride): + super(DepthwiseSeparableConv, self).__init__() + assert stride in [1, 2] + + layers = [ + USConv2d( + inp, inp, 3, stride, 1, groups=inp, depthwise=True, + bias=False), + USBatchNorm2d(inp), + nn.ReLU6(inplace=True), + + USConv2d(inp, outp, 1, 1, 0, bias=False), + USBatchNorm2d(outp), + nn.ReLU6(inplace=True), + ] + self.body = nn.Sequential(*layers) + + def forward(self, x): + return self.body(x) + +class Model(nn.Module): + def __init__(self, num_classes=1000, input_size=224): + super(Model, self).__init__() + + # setting of inverted residual blocks + self.block_setting = [ + # c, n, s + [64, 1, 1], + [128, 2, 2], + [256, 2, 2], + [512, 6, 2], + [1024, 2, 2], + ] + + self.features = [] + + width_mult = FLAGS.width_mult_range[-1] + # head + assert input_size % 32 == 0 + channels = make_divisible(32 * width_mult) + self.outp = make_divisible(1024 * width_mult) + first_stride = 2 + self.features.append( + nn.Sequential( + USConv2d( + 3, channels, 3, first_stride, 1, bias=False, + us=[False, True]), + USBatchNorm2d(channels), + nn.ReLU6(inplace=True)) + ) + + # body + for c, n, s in self.block_setting: + outp = make_divisible(c * width_mult) + for i in range(n): + if i == 0: + self.features.append( + DepthwiseSeparableConv(channels, outp, s)) + else: + self.features.append( + DepthwiseSeparableConv(channels, outp, 1)) + channels = outp + + avg_pool_size = input_size// 32 + self.features.append(nn.AvgPool2d(avg_pool_size)) + + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # classifier + self.classifier = nn.Sequential( + USLinear(self.outp, num_classes, us=[True, False]) + ) + if FLAGS.reset_parameters: + self.reset_parameters() + + def forward(self, x): + x = self.features(x) + last_dim = x.size()[1] + x = x.view(-1, last_dim) + x = self.classifier(x) + return x + + def reset_parameters(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + if m.affine: + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/models/us_mobilenet_v2.py b/models/us_mobilenet_v2.py new file mode 100644 index 0000000..03ef058 --- /dev/null +++ b/models/us_mobilenet_v2.py @@ -0,0 +1,141 @@ +import math +import torch.nn as nn + + +from .slimmable_ops import USBatchNorm2d, USConv2d, USLinear, make_divisible +from utils.config import FLAGS + + +class InvertedResidual(nn.Module): + def __init__(self, inp, outp, stride, expand_ratio): + super(InvertedResidual, self).__init__() + assert stride in [1, 2] + + self.residual_connection = stride == 1 and inp == outp + + layers = [] + # expand + expand_inp = inp * expand_ratio + if expand_ratio != 1: + layers += [ + USConv2d( + inp, expand_inp, 1, 1, 0, bias=False, + ratio=[1, expand_ratio]), + USBatchNorm2d(expand_inp, ratio=expand_ratio), + nn.ReLU6(inplace=True), + ] + # depthwise + project back + layers += [ + USConv2d( + expand_inp, expand_inp, 3, stride, 1, groups=expand_inp, + depthwise=True, bias=False, + ratio=[expand_ratio, expand_ratio]), + USBatchNorm2d(expand_inp, ratio=expand_ratio), + nn.ReLU6(inplace=True), + + USConv2d( + expand_inp, outp, 1, 1, 0, bias=False, + ratio=[expand_ratio, 1]), + USBatchNorm2d(outp), + ] + self.body = nn.Sequential(*layers) + + def forward(self, x): + if self.residual_connection: + res = self.body(x) + res += x + else: + res = self.body(x) + return res + + +class Model(nn.Module): + def __init__(self, num_classes=1000, input_size=224): + super(Model, self).__init__() + + # setting of inverted residual blocks + self.block_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + if FLAGS.dataset == 'cifar10': + self.block_setting[2] = [6, 24, 2, 1] + + self.features = [] + + width_mult = FLAGS.width_mult_range[-1] + # head + assert input_size % 32 == 0 + channels = make_divisible(32 * width_mult) + self.outp = make_divisible( + 1280 * width_mult) if width_mult > 1.0 else 1280 + first_stride = 2 + self.features.append( + nn.Sequential( + USConv2d( + 3, channels, 3, first_stride, 1, bias=False, + us=[False, True]), + USBatchNorm2d(channels), + nn.ReLU6(inplace=True)) + ) + + # body + for t, c, n, s in self.block_setting: + outp = make_divisible(c * width_mult) + for i in range(n): + if i == 0: + self.features.append( + InvertedResidual(channels, outp, s, t)) + else: + self.features.append( + InvertedResidual(channels, outp, 1, t)) + channels = outp + + # tail + self.features.append( + nn.Sequential( + USConv2d( + channels, self.outp, 1, 1, 0, bias=False, + us=[True, False]), + nn.BatchNorm2d(self.outp), + nn.ReLU6(inplace=True), + ) + ) + avg_pool_size = input_size // 32 + self.features.append(nn.AvgPool2d(avg_pool_size)) + + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # classifier + self.classifier = nn.Sequential(nn.Linear(self.outp, num_classes)) + if FLAGS.reset_parameters: + self.reset_parameters() + + def forward(self, x): + x = self.features(x) + x = x.view(-1, self.outp) + x = self.classifier(x) + return x + + def reset_parameters(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + if m.affine: + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/train.py b/train.py index c217b2c..10f575c 100644 --- a/train.py +++ b/train.py @@ -132,15 +132,15 @@ def get_lr_scheduler(optimizer): if i == 0: lr_dict[i] = 1 else: - lr_dict[i] = lr_dict[i-1] * FLAGS.exp_decaying_lr_gamma - lr_lambda = lambda epoch: lr_dict[epoch] # noqa: E731 + lr_dict[i] = lr_dict[i - 1] * FLAGS.exp_decaying_lr_gamma + lr_lambda = lambda epoch: lr_dict[epoch] lr_scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=lr_lambda) elif FLAGS.lr_scheduler == 'linear_decaying': lr_dict = {} for i in range(FLAGS.num_epochs): lr_dict[i] = 1. - i / FLAGS.num_epochs - lr_lambda = lambda epoch: lr_dict[epoch] # noqa: E731 + lr_lambda = lambda epoch: lr_dict[epoch] lr_scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=lr_lambda) else: @@ -248,7 +248,7 @@ def forward_loss(model, criterion, input, target, meter): correct = pred.eq(target.view(1, -1).expand_as(pred)) for k in FLAGS.topk: correct_k = correct[:k].float().sum(0) - error_list = list(1.-correct_k.cpu().detach().numpy()) + error_list = list(1. - correct_k.cpu().detach().numpy()) meter['top{}_error'.format(k)].cache_list(error_list) return loss @@ -271,7 +271,8 @@ def run_one_epoch( other_widths.remove(min_width) if train and FLAGS.lr_scheduler == 'linear_decaying': linear_decaying_per_step = ( - FLAGS.lr/FLAGS.num_epochs/len(loader.dataset)*FLAGS.batch_size) + FLAGS.lr / FLAGS.num_epochs / + len(loader.dataset) * FLAGS.batch_size) for batch_idx, (input, target) in enumerate(loader): target = target.cuda(non_blocking=True) if train: @@ -311,8 +312,8 @@ def run_one_epoch( for k, v in results.items())) else: results = flush_scalar_meters(meters) - print('{:.1f}s\t{}\t{}/{}: '.format( - time.time() - t_start, phase, epoch, FLAGS.num_epochs) + + print('{:.1f}s\t{}\t{}/{}: '.format(time.time() - t_start, + phase, epoch, FLAGS.num_epochs) + ', '.join('{}: {:.3f}'.format(k, v) for k, v in results.items())) return results @@ -391,7 +392,7 @@ def train_val_test(): return print('Start training.') - for epoch in range(last_epoch+1, FLAGS.num_epochs): + for epoch in range(last_epoch + 1, FLAGS.num_epochs): lr_scheduler.step() # train results = run_one_epoch( diff --git a/utils/config.py b/utils/config.py index 1ad7e0b..51c5c8e 100644 --- a/utils/config.py +++ b/utils/config.py @@ -89,7 +89,7 @@ def __repr__(self): ret_str.append('{}:'.format(key)) child_ret_str = value.__repr__().split('\n') for item in child_ret_str: - ret_str.append(' '+item) + ret_str.append(' ' + item) elif isinstance(value, list): if isinstance(value[0], AttrDict): ret_str.append('{}:'.format(key)) @@ -97,7 +97,7 @@ def __repr__(self): # treat as AttrDict above child_ret_str = item.__repr__().split('\n') for item in child_ret_str: - ret_str.append(' '+item) + ret_str.append(' ' + item) else: ret_str.append('{}: {}'.format(key, value)) else: diff --git a/utils/model_profiling.py b/utils/model_profiling.py index 50b4d6a..361ace3 100644 --- a/utils/model_profiling.py +++ b/utils/model_profiling.py @@ -131,9 +131,8 @@ def module_profiling(self, input, output, verbose): def add_profiling_hooks(m, verbose): global model_profiling_hooks model_profiling_hooks.append( - m.register_forward_hook( - lambda m, input, output: module_profiling( - m, input, output, verbose=verbose))) + m.register_forward_hook(lambda m, input, output: module_profiling( + m, input, output, verbose=verbose))) def remove_profiling_hooks(): @@ -174,10 +173,12 @@ def model_profiling(model, height, width, batch=1, channel=3, use_cuda=True, 'macs'.rjust(macs_space, ' ') + 'nanosecs'.rjust(seconds_space, ' ')) if verbose: - print(''.center(name_space+params_space+macs_space+seconds_space, '-')) + print(''.center( + name_space + params_space + macs_space + seconds_space, '-')) model(data) if verbose: - print(''.center(name_space+params_space+macs_space+seconds_space, '-')) + print(''.center( + name_space + params_space + macs_space + seconds_space, '-')) print( 'Total'.ljust(name_space, ' ') + '{:,}'.format(model.n_params).rjust(params_space, ' ') + diff --git a/utils/transforms.py b/utils/transforms.py index c8c15e4..72fa4e8 100644 --- a/utils/transforms.py +++ b/utils/transforms.py @@ -8,9 +8,9 @@ imagenet_pca = { 'eigval': np.asarray([0.2175, 0.0188, 0.0045]), 'eigvec': np.asarray([ - [-0.5675, 0.7192, 0.4009], + [-0.5675, 0.7192, 0.4009], [-0.5808, -0.0045, -0.8140], - [-0.5836, -0.6948, 0.4203], + [-0.5836, -0.6948, 0.4203], ]) }