Using torch.jit.trace with optimize=True shows no performance difference compared with optimize=False
The test model I used is ResNet from torchvision. I modified it to run only the feature extraction (no average pooling and no fc layer for classification).
Inference test.py python script:
""" Pytorch inference script """
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import argparse
import timeit
import numpy as np
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
# Select appropriate model for test
import resnet
def timeGraph(model, batch_size, num_loops):
    """Time repeated forward passes of ``model`` on a random input batch.

    A random ``(batch_size, 3, 1200, 1920)`` float tensor is created and
    placed on the device that holds the model's parameters. After 20
    untimed warm-up passes, ``num_loops`` passes are timed individually.

    Args:
        model: callable taking the input tensor and returning a tensor.
        batch_size (int): size of the first dimension of the random input.
        num_loops (int): number of timed forward passes.

    Returns:
        list of float: wall-clock duration in seconds of each timed pass.
    """
    # Follow the model's own device so the input never lands on a different
    # device than the weights. Models without parameters fall back to CUDA
    # when it is available, otherwise to the CPU.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_gpu = device.type == "cuda"

    def wait_for_device():
        # CUDA kernels are queued asynchronously; without this barrier the
        # timer can stop before the queued work has actually finished.
        if on_gpu:
            torch.cuda.synchronize(device)

    # Create random input tensor of certain size
    batch = torch.rand(batch_size, 3, 1200, 1920, dtype=torch.float).to(device)

    print("Warm up ...")
    with torch.no_grad():
        for _ in range(20):
            model(batch)
    wait_for_device()

    print("Start timing ...")
    timings = []
    features = None
    with torch.no_grad():
        for i in range(num_loops):
            wait_for_device()
            start_time = timeit.default_timer()
            features = model(batch)
            wait_for_device()
            elapsed = timeit.default_timer() - start_time
            timings.append(elapsed)
            print("Iteration {}: {:.6f} s".format(i, elapsed))

    # With num_loops == 0 there is no output to report.
    if features is not None:
        print("Output features size:", features.size())
    return timings
def printStats(graphName,timings,batch_size):
    """Print a throughput and latency summary for a list of timed runs.

    Args:
        graphName: label printed in the summary header.
        timings: per-iteration durations in seconds.
        batch_size: number of samples processed per iteration; throughput
            is computed as ``batch_size / duration`` for every iteration.
    """
    latencies = np.array(timings)
    throughput = batch_size / latencies

    latency_stats = (
        np.median(latencies),
        np.mean(latencies),
        np.percentile(latencies, 99),
        np.std(latencies, ddof=0),
    )
    throughput_stats = (np.median(throughput), np.mean(throughput))

    template = ("\n%s =================================\n"
                "batch size=%d, num iterations=%d\n"
                " Median FPS: %.1f, mean: %.1f\n"
                " Median latency: %.6f, mean: %.6f, 99th_p: %.6f, std_dev: %.6f\n"
                )
    # Field order must match the placeholders in the template above.
    fields = (graphName, batch_size, len(latencies)) + throughput_stats + latency_stats
    print(template % fields)
# Script entry point: build a randomly initialised ResNet-50, trace it with
# torch.jit.trace, optionally move it to the requested GPU, then time
# inference on random input and print the statistics.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run inference on a model with random input values")
    parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.')
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size (default=1)")
    parser.add_argument('--optimize', action='store_true', help='Turn on optimization for traced model')
    parser.add_argument("--iter", default=10, type=int, help="Number of iteration loops")
    args = parser.parse_args()

    # Creating model with random weights
    model = resnet.resnet50()
    # This is an inference benchmark: put BatchNorm layers in evaluation mode
    # before tracing so the recorded graph is the inference graph rather than
    # the training one.
    model.eval()

    print("Tracing model... Optimization=", args.optimize)
    example_input = torch.rand(args.batch_size, 3, 1200, 1920, dtype=torch.float)
    # NOTE(review): in later PyTorch releases the `optimize` argument is
    # documented as deprecated with no effect - confirm against the installed
    # version's torch.jit.trace documentation.
    traced_model = torch.jit.trace(model, example_input,
                                   check_trace=True,
                                   check_tolerance=1e-05,
                                   optimize=args.optimize,
                                   )

    # Save the script module
    # traced_model.save("model_traced.pt")

    # Create graph on GPU if CUDA is available
    if args.gpu is not None:
        if not torch.cuda.is_available():
            raise RuntimeError("No cuda available.")
        # Make the requested GPU the current device so tensors created with a
        # bare .cuda() end up on the same GPU as the model.
        torch.cuda.set_device(args.gpu)
        # Enable CuDNN autotune for better performance (with fixed inputs)
        cudnn.benchmark = True
        traced_model = traced_model.cuda(args.gpu)

        # NOTE(review): the original paste lost its indentation; the device
        # report is assumed to belong to the --gpu branch.
        dev = torch.cuda.current_device()
        print("Cuda device id, count=", dev, torch.cuda.device_count())
        print("Cuda DNN version=", cudnn.version())
        print("Cuda compute capability=", torch.cuda.get_device_capability(dev))
        print("Cuda device name=", torch.cuda.get_device_name(dev))

    # Timing graph inference
    timings = timeGraph(traced_model, args.batch_size, args.iter)
    printStats("resnet", timings, args.batch_size)
Modified resnet.py from torchvision
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
# Names exported by `from resnet import *`.
# NOTE(review): only the `ResNet` class and the `resnet50` factory are visible
# in this excerpt - confirm the other listed factories exist in the full module.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152']
# Download URLs for pretrained weights, keyed by architecture name.
# resnet50() reads its entry from here when called with pretrained=True.
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free ``nn.Conv2d`` with a 3x3 kernel and padding of 1.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride (defaults to 1).
    """
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_options)
class BasicBlock(nn.Module):
    """Residual block built from two 3x3 convolutions, each followed by batch norm.

    ``expansion`` is the ratio between the block's output channels and
    ``planes``; it is 1 for this block.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """Create the block's layers.

        Args:
            inplanes: channels of the incoming tensor.
            planes: channels produced by both convolutions.
            stride: stride of the first convolution.
            downsample: optional module applied to the input to produce the
                shortcut tensor; when ``None`` the input itself is used.
        """
        super(BasicBlock, self).__init__()
        # Submodules are registered in this order on purpose: conv1, bn1,
        # relu, conv2, bn2, downsample.
        self.conv1 = conv3x3(inplanes, planes, stride=stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        # In-place accumulation into the main-path tensor, as in the original.
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack.

    The last 1x1 convolution produces ``planes * expansion`` channels; the
    configured stride is applied by the 3x3 convolution.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """Create the block's layers.

        Args:
            inplanes: channels of the incoming tensor.
            planes: channels used by the first two convolutions.
            stride: stride of the 3x3 convolution.
            downsample: optional module applied to the input to produce the
                shortcut tensor; when ``None`` the input itself is used.
        """
        super(Bottleneck, self).__init__()
        widened = planes * self.expansion

        # Submodules are registered in this order on purpose: conv1, bn1,
        # conv2, bn2, conv3, bn3, relu, downsample.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        # In-place accumulation into the main-path tensor, as in the original.
        out += shortcut
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet backbone whose ``forward`` returns the output of ``layer4``.

    The average-pooling and fully connected layers are still constructed
    (``avgpool`` and ``fc``), but ``forward`` does not apply them.
    """

    def __init__(self, block, layers, num_classes=1000):
        """Build the stem, the four residual stages and the unused head.

        Args:
            block: residual block class; it must expose an ``expansion``
                attribute and accept ``(inplanes, planes, stride, downsample)``.
            layers: sequence with the number of blocks for each of the four
                stages.
            num_classes: output size of the ``fc`` layer.
        """
        # Channel count fed into the next stage; updated by _make_layer.
        self.inplanes = 64
        super(ResNet, self).__init__()

        # Stem
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Residual stages
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Classification head: constructed but not used by forward().
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Initialise every convolution and batch-norm layer created above.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Assemble one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        Only the first block receives ``stride`` and, when the stride or the
        channel count changes, a 1x1 convolution + batch-norm shortcut.
        """
        out_planes = planes * block.expansion

        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = out_planes
        stage.extend(block(self.inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the stem and the four stages; the classification head is skipped."""
        pipeline = (
            self.conv1, self.bn1, self.relu, self.maxpool,
            self.layer1, self.layer2, self.layer3, self.layer4,
        )
        for stage in pipeline:
            x = stage(x)
        return x
def resnet50(pretrained=False, **kwargs):
    """Build a ResNet-50: ``Bottleneck`` blocks in stages of 3, 4, 6 and 3.

    Args:
        pretrained (bool): If True, the weights published at
            ``model_urls['resnet50']`` (pre-trained on ImageNet) are
            downloaded and loaded into the model.
        **kwargs: forwarded unchanged to the ``ResNet`` constructor.
    """
    stage_depths = [3, 4, 6, 3]
    net = ResNet(Bottleneck, stage_depths, **kwargs)
    if not pretrained:
        return net

    weights = model_zoo.load_url(model_urls['resnet50'])
    net.load_state_dict(weights)
    return net
To Reproduce
Steps to reproduce the behavior:
Run test.py with GPU:
python test.py --gpu 0 --iter 100
Run test.py with GPU and trace optimize:
python test.py --gpu 0 --optimize --iter 100
Expected behavior
Tracing model... Optimization= True
Cuda device id, count= 0 1
Cuda DNN version= 7401
Cuda compute capability= (6, 1)
Cuda device name= GeForce GTX 1080
Warm up ...
Start timing ...
Iteration 0: 0.133147 s
Iteration 1: 0.137695 s
Iteration 2: 0.132463 s
Iteration 3: 0.132877 s
Iteration 4: 0.132633 s
Iteration 5: 0.137405 s
Iteration 6: 0.134528 s
Iteration 7: 0.133907 s
Iteration 8: 0.134656 s
Iteration 9: 0.133537 s
Output features size: (1, 2048, 38, 60)
resnet =================================
batch size=1, num iterations=10
Median FPS: 7.5, mean: 7.4
Median latency: 0.133722, mean: 0.134285, 99th_p: 0.137669, std_dev: 0.001777
Environment
How PyTorch was installed (conda, pip, source): pip/source
I also tried running the model without jit.trace, and there seems to be little change in performance either.
cc @suo
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4