
Attempting to export HybridNets to ONNX

Traceback (most recent call last):
  File "hybridnets_test.py", line 122, in <module>
    torch.onnx.export(
  File "/usr/local/lib/python3.8/dist-packages/torch/onnx/__init__.py", line 316, in export
    return utils.export(model, args, f, export_params, verbose, training,
  File "/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py", line 107, in export
    _export(model, args, f, export_params, verbose, training, input_names, output_names,
  File "/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py", line 737, in _export
    proto, export_map, val_use_external_data_format = graph._export_onnx(
RuntimeError: ONNX export failed: Couldn't export Python operator SwishImplementation
Defined at:
/usr/local/lib/python3.8/dist-packages/efficientnet_pytorch/utils.py(80): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1090): _slow_forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/container.py(141): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1090): _slow_forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/home/user/workdir/encoders/efficientnet.py(66): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1090): _slow_forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/home/user/workdir/backbone.py(102): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1090): _slow_forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py(118): wrapper
/usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py(127): forward
/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py(1102): _call_impl
/usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py(1166): _get_trace_graph
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(388): _trace_and_get_graph_from_model
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(437): _create_jit_graph
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(493): _model_to_graph
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(724): _export
/usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py(107): export
/usr/local/lib/python3.8/dist-packages/torch/onnx/__init__.py(316): export
hybridnets_test.py(123): <module>
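The export fails because efficientnet_pytorch defaults to MemoryEfficientSwish, whose forward pass goes through a custom torch.autograd.Function (SwishImplementation) that the ONNX tracer cannot lower to ONNX operators. The patches below force every HybridNets module into ONNX-export mode and swap the activation. As a less invasive alternative, efficientnet_pytorch models also expose set_swish(); a minimal sketch, assuming you hold a bare EfficientNet instance:

from efficientnet_pytorch import EfficientNet

model = EfficientNet.from_pretrained('efficientnet-b3')
# Replace MemoryEfficientSwish with the traceable Swish before tracing/export
model.set_swish(memory_efficient=False)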

From:

backbone.py
        self.bifpn = nn.Sequential(
            *[BiFPN(self.fpn_num_filters[self.compound_coef],
                    conv_channel_coef[compound_coef],
                    True if _ == 0 else False,
                    attention=True if compound_coef < 6 else False,
                    use_p8=compound_coef > 7)

To:

backbone.py
        self.bifpn = nn.Sequential(
            *[BiFPN(self.fpn_num_filters[self.compound_coef],
                    conv_channel_coef[compound_coef],
                    True if _ == 0 else False,
                    attention=True if compound_coef < 6 else False,
                    use_p8=compound_coef > 7,
                    onnx_export=True)

From:

backbone.py
        self.regressor = Regressor(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                                   num_layers=self.box_class_repeats[self.compound_coef],
                                   pyramid_levels=self.pyramid_levels[self.compound_coef])

To:

backbone.py
        self.regressor = Regressor(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                                   num_layers=self.box_class_repeats[self.compound_coef],
                                   pyramid_levels=self.pyramid_levels[self.compound_coef],
                                   onnx_export=True)

From:

backbone.py
        self.classifier = Classifier(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                                     num_classes=num_classes,
                                     num_layers=self.box_class_repeats[self.compound_coef],
                                     pyramid_levels=self.pyramid_levels[self.compound_coef])

To:

backbone.py
        self.classifier = Classifier(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                                     num_classes=num_classes,
                                     num_layers=self.box_class_repeats[self.compound_coef],
                                     pyramid_levels=self.pyramid_levels[self.compound_coef],
                                     onnx_export=True)
  • grep for onnx_export
  • change every onnx_export default to True (each class signature is shown before and after):
class SeparableConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=False):
class SeparableConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=True):
class BiFPN(nn.Module):
    def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=False, attention=True,
class BiFPN(nn.Module):
    def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=True, attention=True,
class Regressor(nn.Module):
    def __init__(self, in_channels, num_anchors, num_layers, pyramid_levels=5, onnx_export=False):
class Regressor(nn.Module):
    def __init__(self, in_channels, num_anchors, num_layers, pyramid_levels=5, onnx_export=True):
class Classifier(nn.Module):
    def __init__(self, in_channels, num_anchors, num_classes, num_layers, pyramid_levels=5, onnx_export=False):
class Classifier(nn.Module):
    def __init__(self, in_channels, num_anchors, num_classes, num_layers, pyramid_levels=5, onnx_export=True):
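For context, the call all of these patches feed into is the export at the top of hybridnets_test.py; a minimal sketch of that step (constructor arguments, weight path, and opset are assumptions, not the exact script):

import torch
from backbone import HybridNetsBackbone

H, W = 256, 384
model = HybridNetsBackbone(compound_coef=3)  # hypothetical arguments
model.load_state_dict(torch.load('weights/hybridnets.pth', map_location='cpu'))
model.eval()

torch.onnx.export(
    model,
    torch.ones((1, 3, H, W), dtype=torch.float32),
    f'hybridnets_{H}x{W}.onnx',
    opset_version=11,
)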
From:

backbone.py
        if backbone_name:
            # Use timm to create another backbone that you prefer
            # https://github.com/rwightman/pytorch-image-models
            self.encoder = timm.create_model(backbone_name, pretrained=True, features_only=True, out_indices=(2,3,4))  # P3,P4,P5
To:

backbone.py
        if backbone_name:
            # Use timm to create another backbone that you prefer
            # https://github.com/rwightman/pytorch-image-models
            self.encoder = timm.create_model(backbone_name, pretrained=True, features_only=True, out_indices=(2,3,4), exportable=True)  # P3,P4,P5
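timm can trip the tracer the same way: some of its backbones default to JIT/memory-efficient activation implementations, and exportable=True tells create_model to select export-friendly layers instead. A quick way to sanity-check a candidate encoder in isolation (a sketch; the model name is just an example):

import timm
import torch

encoder = timm.create_model(
    'tf_efficientnet_b3',    # example backbone name
    pretrained=False,
    features_only=True,
    out_indices=(2, 3, 4),   # P3, P4, P5
    exportable=True,
)
encoder.eval()
torch.onnx.export(encoder, torch.ones(1, 3, 256, 384), 'encoder_check.onnx', opset_version=11)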

From:

/usr/local/lib/python3.8/dist-packages/efficientnet_pytorch/model.py
self._swish = MemoryEfficientSwish()

To:

/usr/local/lib/python3.8/dist-packages/efficientnet_pytorch/model.py
self._swish = Swish()
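For reference, the two activations in efficientnet_pytorch/utils.py look roughly like this (paraphrased from the library). Swish is an ordinary module the tracer can decompose into Mul/Sigmoid ops; MemoryEfficientSwish routes through a custom autograd.Function, which is exactly the "Python operator" the exporter rejects:

import torch
from torch import nn

class SwishImplementation(torch.autograd.Function):
    @staticmethod
    def forward(ctx, i):
        result = i * torch.sigmoid(i)
        ctx.save_for_backward(i)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        i = ctx.saved_tensors[0]
        sigmoid_i = torch.sigmoid(i)
        return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))

class MemoryEfficientSwish(nn.Module):
    def forward(self, x):
        return SwishImplementation.apply(x)

class Swish(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(x)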
  • Benchmark logic (batch_size=1)
test.py
import onnxruntime as ort
import numpy as np
import time

H=256
W=384
# MODEL_FILE1 = f'hybridnets_Nx{H}x{W}.onnx'
MODEL_FILE2 = f'hybridnets_{H}x{W}.onnx'

#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ TensorRT test')

session_option = ort.SessionOptions()
session_option.log_severity_level = 4

# Single inference x10
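# TensorRT EP: cache the built engine to disk and allow FP16 kernels
# (see the provider options below).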
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option,
    providers=[
        (
            'TensorrtExecutionProvider', {
                'trt_engine_cache_enable': True,
                'trt_fp16_enable': True,
            }
        ),
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([1, 3, H, W]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones(([1, 3, H, W]), dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
total_time = (time.time()-start)*1000
print(f'elapsed time: {total_time} ms')
print(f'avg time: {total_time/10} ms')
print()


#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CUDA test')

session_option2 = ort.SessionOptions()
session_option2.log_severity_level = 4
session_option2.optimized_model_filepath = f"{MODEL_FILE2}_cudaopt.onnx"
session_option2.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
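# optimized_model_filepath above dumps the graph after ORT's extended
# optimizations, so the CUDA-optimized model can be inspected or reloaded later.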

# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option2,
    providers=[
        'CUDAExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([1, 3, H, W]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones(([1, 3, H, W]), dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
total_time = (time.time()-start)*1000
print(f'elapsed time: {total_time} ms')
print(f'avg time: {total_time/10} ms')
print()


#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CPU test')

session_option = ort.SessionOptions()
session_option.log_severity_level = 4

# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option,
    providers=[
        'CPUExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([1, 3, H, W]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones(([1, 3, H, W]), dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
total_time = (time.time()-start)*1000
print(f'elapsed time: {total_time} ms')
print(f'avg time: {total_time/10} ms')
print()


#============================================================================
print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ OpenVINO Execution Provider "CPU" test')

session_option = ort.SessionOptions()
session_option.log_severity_level = 4
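# Hand the whole graph to OpenVINO (skip ORT's own optimizations) and pin the
# OpenVINO target device to CPU/FP32 via the private set_openvino_device API.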
session_option.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
ort.capi._pybind_state.set_openvino_device('CPU_FP32')

# Single inference x10
onnx_session = ort.InferenceSession(
    MODEL_FILE2,
    sess_options=session_option,
    providers=[
        'OpenVINOExecutionProvider',
    ],
)
input_name = onnx_session.get_inputs()[0].name
output_name = onnx_session.get_outputs()[0].name
print('@@@@@@@@@@ Single inference x10')
# Warm up
results = onnx_session.run(
    None,
    {input_name: np.ones(([1, 3, H, W]), dtype=np.float32)},
)
# Inference
print(f'input.shape: {onnx_session.get_inputs()[0].shape}')
start = time.time()
for i in range(10):
    results = onnx_session.run(
        None,
        {input_name: np.ones(([1, 3, H, W]), dtype=np.float32)},
    )
print(f'results.shape: {results[0].shape}')
total_time = (time.time()-start)*1000
print(f'elapsed time: {total_time} ms')
print(f'avg time: {total_time/10} ms')
print()
  • Benchmark at 256x384
  1. ONNX -> TensorRT
  2. ONNX -> CUDA
  3. onnxruntime CPU
  4. onnxruntime OpenVINO Execution Provider CPU
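Each figure below is the average over 10 synchronous runs after a single warm-up inference; for the TensorRT EP the warm-up also absorbs the engine build (the engine cache is enabled, so later runs reuse it).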
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ TensorRT test
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 384]
results.shape: (1, 18414, 4)
elapsed time: 44.289588928222656 ms
avg time: 4.428958892822266 ms

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CUDA test
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 384]
results.shape: (1, 18414, 4)
elapsed time: 106.24074935913086 ms
avg time: 10.624074935913086 ms

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ CPU test
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 384]
results.shape: (1, 18414, 4)
elapsed time: 513.0438804626465 ms
avg time: 51.30438804626465 ms

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ OpenVINO Execution Provider "CPU" test
@@@@@@@@@@ Single inference x10
input.shape: [1, 3, 256, 384]
results.shape: (1, 18414, 4)
elapsed time: 164.85214233398438 ms
avg time: 16.485214233398438 ms
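
Summary (avg of 10 runs, 1x3x256x384 input):
  TensorRT (FP16):          4.43 ms
  CUDA:                    10.62 ms
  CPU:                     51.30 ms
  OpenVINO EP (CPU_FP32):  16.49 ms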
  • PyTorch EfficientNet: already patched as above
  • timm: already patched as above
docker pull pinto0309/hybrid_nets_export:latest

docker run --gpus all -it --rm \
-v `pwd`:/home/user/workdir \
pinto0309/hybrid_nets_export:latest
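
Inside the container the whole flow above reproduces end to end; a sketch, assuming the export script and test.py sit in the mounted workdir:

cd /home/user/workdir
python3 hybridnets_test.py   # export hybridnets_256x384.onnx (filename is an assumption)
python3 test.py              # run the four benchmark cases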