HybridNets ONNX export attempt
Traceback (most recent call last):
File "hybridnets_test.py", line 122, in <module>
torch.onnx.export(
File "/usr/local/lib/python3.8/dist-packages/torch/onnx/__init__.py", line 316, in export
return utils.export(model, args, f, export_params, verbose, training,
File "/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py", line 107, in export
_export(model, args, f, export_params, verbose, training, input_names, output_names,
File "/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py", line 737, in _export
proto, export_map, val_use_external_data_format = graph._export_onnx(
RuntimeError: ONNX export failed: Couldn't export Python operator SwishImplementation
Defined at:
/usr/local/lib/python3.8/dist-packages/efficientnet_pytorch/utils.py(80): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1090): _slow_forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/container.py(141): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1090): _slow_forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/home/user/workdir/encoders/efficientnet.py(66): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1090): _slow_forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/home/user/workdir/backbone.py(102): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1090): _slow_forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py(118): wrapper
/usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py(127): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py(1166): _get_trace_graph
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(388): _trace_and_get_graph_from_model
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(437): _create_jit_graph
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(493): _model_to_graph
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(724): _export
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(107): export
/usr/local/lib/python3.8/dist-packages/torch/onnx/__init__.py(316): export
hybridnets_test.py(123): <module>
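The failure comes from EfficientNet-PyTorch's default activation: MemoryEfficientSwish wraps a custom torch.autograd.Function (SwishImplementation), and the ONNX tracer cannot export arbitrary Python operators. The plain Swish module computes the same thing with ordinary traceable ops. A sketch of the relevant definitions in efficientnet_pytorch/utils.py (abridged; the custom backward is omitted here):

import torch
import torch.nn as nn

class SwishImplementation(torch.autograd.Function):
    # A Python-level autograd Function is opaque to the ONNX tracer,
    # hence "Couldn't export Python operator SwishImplementation".
    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)
        return result

class MemoryEfficientSwish(nn.Module):
    def forward(self, x):
        return SwishImplementation.apply(x)

class Swish(nn.Module):
    # Same math with traceable ops -> exports as Mul + Sigmoid.
    def forward(self, x):
        return x * torch.sigmoid(x)

The fix below threads an onnx_export flag through every module that instantiates the activation, so that Swish is used instead.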
From:
backbone.py
self.bifpn = nn.Sequential(
    *[BiFPN(self.fpn_num_filters[self.compound_coef],
            conv_channel_coef[compound_coef],
            True if _ == 0 else False,
            attention=True if compound_coef < 6 else False,
            use_p8=compound_coef > 7)
To:
backbone.py
self.bifpn = nn.Sequential(
    *[BiFPN(self.fpn_num_filters[self.compound_coef],
            conv_channel_coef[compound_coef],
            True if _ == 0 else False,
            attention=True if compound_coef < 6 else False,
            use_p8=compound_coef > 7,
            onnx_export=True)
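In the EfficientDet-derived modules that HybridNets builds on, onnx_export typically selects the activation at construction time, along the lines of (a sketch, assuming the Yet-Another-EfficientDet-Pytorch convention):

self.swish = MemoryEfficientSwish() if not onnx_export else Swish()

so passing onnx_export=True keeps SwishImplementation out of the traced graph. The detection heads need the same flag: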
From:
backbone.py
self.regressor = Regressor(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                           num_layers=self.box_class_repeats[self.compound_coef],
                           pyramid_levels=self.pyramid_levels[self.compound_coef])
To:
backbone.py
self.regressor = Regressor(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                           num_layers=self.box_class_repeats[self.compound_coef],
                           pyramid_levels=self.pyramid_levels[self.compound_coef],
                           onnx_export=True)
From:
backbone.py
self.classifier = Classifier(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                             num_classes=num_classes,
                             num_layers=self.box_class_repeats[self.compound_coef],
                             pyramid_levels=self.pyramid_levels[self.compound_coef])
To:
backbone.py
self.classifier = Classifier(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                             num_classes=num_classes,
                             num_layers=self.box_class_repeats[self.compound_coef],
                             pyramid_levels=self.pyramid_levels[self.compound_coef],
                             onnx_export=True)
- Grep for onnx_export and change every default from onnx_export=False to onnx_export=True (before/after for each hit):
From:
class SeparableConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=False):
To:
class SeparableConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=True):
From:
class BiFPN(nn.Module):
    def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=False, attention=True,
To:
class BiFPN(nn.Module):
    def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=True, attention=True,
From:
class Regressor(nn.Module):
    def __init__(self, in_channels, num_anchors, num_layers, pyramid_levels=5, onnx_export=False):
To:
class Regressor(nn.Module):
    def __init__(self, in_channels, num_anchors, num_layers, pyramid_levels=5, onnx_export=True):
From:
class Classifier(nn.Module):
    def __init__(self, in_channels, num_anchors, num_classes, num_layers, pyramid_levels=5, onnx_export=False):
To:
class Classifier(nn.Module):
    def __init__(self, in_channels, num_anchors, num_classes, num_layers, pyramid_levels=5, onnx_export=True):
From:
backbone.py
if backbone_name:
    # Use timm to create another backbone that you prefer
    # https://github.com/rwightman/pytorch-image-models
    self.encoder = timm.create_model(backbone_name, pretrained=True, features_only=True, out_indices=(2,3,4))  # P3,P4,P5
To:
backbone.py
if backbone_name:
    # Use timm to create another backbone that you prefer
    # https://github.com/rwightman/pytorch-image-models
    self.encoder = timm.create_model(backbone_name, pretrained=True, features_only=True, out_indices=(2,3,4), exportable=True)  # P3,P4,P5
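timm's create_model() accepts an exportable=True flag that switches layers to export-friendly implementations (e.g. avoiding scripted or memory-efficient activations), so it plays the same role for a timm encoder as onnx_export does above.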
From:
/usr/local/lib/python3.8/dist-packages/efficientnet_pytorch/model.py
self._swish = MemoryEfficientSwish()
To:
/usr/local/lib/python3.8/dist-packages/efficientnet_pytorch/model.py
self._swish = Swish()
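With these changes in place, the export itself is a standard torch.onnx.export call. A minimal sketch of what hybridnets_test.py does around line 122; the constructor arguments, checkpoint path, and opset below are placeholders, not necessarily the exact values used here:

import torch
from backbone import HybridNetsBackbone  # HybridNets repo

H, W = 256, 384
model = HybridNetsBackbone(compound_coef=3)  # placeholder arguments
ckpt = torch.load('hybridnets.pth', map_location='cpu')  # placeholder path
model.load_state_dict(ckpt, strict=False)
model.eval()

torch.onnx.export(
    model,
    torch.randn(1, 3, H, W),   # fixed-shape dummy input
    f'hybridnets_{H}x{W}.onnx',
    opset_version=11,          # assumption
    input_names=['input'],
)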
- Input resolution must be a multiple of 128? (The feature pyramid goes down to P7, i.e. stride 128, so H and W presumably need to be divisible by 128; 256x384 satisfies this.)
- Benchmark logic (batch_size=1)
test.py
import onnxruntime as ort
import numpy as np
import time
H=256
W=384
# MODEL_FILE1 = f'hybridnets_Nx{H}x{W}.onnx'
MODEL_FILE2 = f'hybridnets_{H}x{W}.onnx'
#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ TensorRT test')
session_option = ort.SessionOptions()
session_option.log_severity_level = 4
# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option,
    providers=[
        (
            'TensorrtExecutionProvider', {
                'trt_engine_cache_enable': True,
                'trt_fp16_enable': True,
            }
        ),
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones([1, 3, H, W], dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones([1, 3, H, W], dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
total_time = (time.time()-start)*1000
print(f'elapsed time: {total_time} ms')
print(f'avg time: {total_time/10} ms')
print()
#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CUDA test')
session_option2 = ort.SessionOptions()
session_option2.log_severity_level = 4
session_option2.optimized_model_filepath = f"{MODEL_FILE2}_cudaopt.onnx"
session_option2.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option2,
    providers=[
        'CUDAExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones([1, 3, H, W], dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones([1, 3, H, W], dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
total_time = (time.time()-start)*1000
print(f'elapsed time: {total_time} ms')
print(f'avg time: {total_time/10} ms')
print()
#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CPU test')
session_option = ort.SessionOptions()
session_option.log_severity_level = 4
# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option,
    providers=[
        'CPUExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones([1, 3, H, W], dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones([1, 3, H, W], dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
total_time = (time.time()-start)*1000
print(f'elapsed time: {total_time} ms')
print(f'avg time: {total_time/10} ms')
print()
#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ OpenVINO Execution Provider "CPU" test')
session_option = ort.SessionOptions()
session_option.log_severity_level = 4
session_option.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
ort.capi._pybind_state.set_openvino_device('CPU_FP32')
# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option,
    providers=[
        'OpenVINOExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones([1, 3, H, W], dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones([1, 3, H, W], dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
total_time = (time.time()-start)*1000
print(f'elapsed time: {total_time} ms')
print(f'avg time: {total_time/10} ms')
print()
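Note: because trt_engine_cache_enable is set, the first TensorRT session builds (and caches) the engine, which takes a while; in every block the warm-up run is deliberately excluded from the timed loop.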
- Benchmark at 256x384
  - ONNX -> TensorRT
  - ONNX -> CUDA
  - onnxruntime CPU
  - onnxruntime OpenVINO Execution Provider CPU
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ TensorRT test
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 384]
results.shape: (1, 18414, 4)
elapsed time: 44.289588928222656 ms
avg time: 4.428958892822266 ms
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CUDA test
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 384]
results.shape: (1, 18414, 4)
elapsed time: 106.24074935913086 ms
avg time: 10.624074935913086 ms
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CPU test
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 384]
results.shape: (1, 18414, 4)
elapsed time: 513.0438804626465 ms
avg time: 51.30438804626465 ms
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ OpenVINO Execution Provider "CPU" test
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 384]
results.shape: (1, 18414, 4)
elapsed time: 164.85214233398438 ms
avg time: 16.485214233398438 ms
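Average latency per inference at 256x384, batch_size=1 (10 runs):
- TensorRT (FP16): 4.43 ms
- CUDA: 10.62 ms
- OpenVINO EP (CPU): 16.49 ms
- onnxruntime CPU: 51.30 ms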
- PyTorch EfficientNet already patched
- timm already patched
docker pull pinto0309/hybrid_nets_export:latest
docker run --gpus all -it --rm \
-v `pwd`:/home/user/workdir \
pinto0309/hybrid_nets_export:latest
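The image ships with the patched efficientnet_pytorch and timm, and the current directory is mounted at /home/user/workdir, so the export and benchmark scripts above can be run inside the container as-is.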