Open6

onnxのレイヤーごとの処理時間や使用スレッド情報などのプロファイリング・ベンチマーク

PINTOPINTO

今までどおり普通に推論するとプロファイリング結果が記録された xxxx.json が勝手に出力される。

onnx_inference_profiling.py
import onnxruntime

options = onnxruntime.SessionOptions()
options.enable_profiling = True
onnx_session = onnxruntime.InferenceSession(
    'xxxx.onnx',
    sess_options=options,
    providers=['CPUExecutionProvider'],
)

input_name = onnx_session.get_inputs()[0].name
data = np.ones((256,256))
result = onnx_session.run(None, {input_name: data})
prof_file = onnx_session.end_profiling()
print(prof_file)
PINTOPINTO
  • 出力サンプル (.json)
[
{"cat" : "Session","pid" :98799,"tid" :98799,"dur" :4488,"ts" :7,"ph" : "X","name" :"model_loading_uri","args" : {}},
{"cat" : "Session","pid" :98799,"tid" :98799,"dur" :34878,"ts" :4569,"ph" : "X","name" :"session_initialization","args" : {}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :0,"ts" :551778,"ph" : "X","name" :"Mul_20_fence_before","args" : {"op_name" : "Mul"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :146,"ts" :551780,"ph" : "X","name" :"Mul_20_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "140479742736192", "block_size": [17280], "core": 15, "Distribution": 5, "DistributionEnqueue": 4, "Run": 98, "Wait": 14, "WaitRevoke": 0}, "sub_threads": {"140478953723648": {"num_run": 0, "core": -1},"140478995670784": {"num_run": 1, "core": 1},"140479037617920": {"num_run": 1, "core": 2},"140479079565056": {"num_run": 1, "core": 3},"140479247353600": {"num_run": 1, "core": 4},"140479205406464": {"num_run": 1, "core": 5},"140479163459328": {"num_run": 0, "core": -1},"140479121512192": {"num_run": 0, "core": -1},"140477999970048": {"num_run": 0, "core": -1}}},"activation_size" : "691200","parameter_size" : "4","graph_index" : "19","output_size" : "691200","provider" : "CPUExecutionProvider","exec_plan_index" : "19","op_name" : "Mul"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :0,"ts" :551930,"ph" : "X","name" :"Mul_20_fence_after","args" : {"op_name" : "Mul"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :0,"ts" :551932,"ph" : "X","name" :"Mul_21_fence_before","args" : {"op_name" : "Mul"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :75,"ts" :551932,"ph" : "X","name" :"Mul_21_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "140479742736192", "block_size": [17280], "core": 15, "Distribution": 0, "DistributionEnqueue": 0, "Run": 35, "Wait": 5, "WaitRevoke": 0}, "sub_threads": {"140478953723648": {"num_run": 1, "core": 0},"140478995670784": {"num_run": 2, "core": 1},"140479037617920": {"num_run": 2, "core": 2},"140479079565056": {"num_run": 2, "core": 3},"140479247353600": {"num_run": 2, "core": 4},"140479205406464": {"num_run": 2, "core": 5},"140479163459328": {"num_run": 1, "core": 6},"140479121512192": {"num_run": 1, "core": 7},"140477999970048": {"num_run": 1, "core": 8}}},"activation_size" : "1382400","parameter_size" : "0","graph_index" : "20","output_size" : "691200","provider" : "CPUExecutionProvider","exec_plan_index" : "20","op_name" : "Mul"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :0,"ts" :552010,"ph" : "X","name" :"Mul_21_fence_after","args" : {"op_name" : "Mul"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :0,"ts" :552011,"ph" : "X","name" :"Sub_22_fence_before","args" : {"op_name" : "Sub"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :19,"ts" :552012,"ph" : "X","name" :"Sub_22_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "140479742736192", "block_size": [17280], "core": 15, "Distribution": 0, "DistributionEnqueue": 0, "Run": 6, "Wait": 3, "WaitRevoke": 0}, "sub_threads": {"140478953723648": {"num_run": 2, "core": 0},"140478995670784": {"num_run": 3, "core": 1},"140479037617920": {"num_run": 3, "core": 2},"140479079565056": {"num_run": 3, "core": 3},"140479247353600": {"num_run": 3, "core": 4},"140479205406464": {"num_run": 3, "core": 5},"140479163459328": {"num_run": 2, "core": 6},"140479121512192": {"num_run": 2, "core": 7},"140477999970048": {"num_run": 2, "core": 8}}},"activation_size" : "1382400","parameter_size" : "0","graph_index" : "21","output_size" : "691200","provider" : "CPUExecutionProvider","exec_plan_index" : "21","op_name" : "Sub"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :0,"ts" :552035,"ph" : "X","name" :"Sub_22_fence_after","args" : {"op_name" : "Sub"}},
    :
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :19,"ts" :662081,"ph" : "X","name" :"Add_66_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "140479742736192", "block_size": [17280], "core": 15, "Distribution": 0, "DistributionEnqueue": 0, "Run": 6, "Wait": 3, "WaitRevoke": 0}, "sub_threads": {"140478953723648": {"num_run": 53, "core": 0},"140478995670784": {"num_run": 54, "core": 1},"140479037617920": {"num_run": 54, "core": 2},"140479079565056": {"num_run": 54, "core": 3},"140479247353600": {"num_run": 54, "core": 4},"140479205406464": {"num_run": 54, "core": 5},"140479163459328": {"num_run": 53, "core": 6},"140479121512192": {"num_run": 53, "core": 7},"140477999970048": {"num_run": 53, "core": 8}}},"activation_size" : "1382400","parameter_size" : "0","graph_index" : "58","output_size" : "691200","provider" : "CPUExecutionProvider","exec_plan_index" : "58","op_name" : "Add"}},
{"cat" : "Node","pid" :98799,"tid" :98799,"dur" :0,"ts" :662103,"ph" : "X","name" :"Add_66_fence_after","args" : {"op_name" : "Add"}},
{"cat" : "Session","pid" :98799,"tid" :98799,"dur" :110337,"ts" :551771,"ph" : "X","name" :"SequentialExecutor::Execute","args" : {}},
{"cat" : "Session","pid" :98799,"tid" :98799,"dur" :110350,"ts" :551765,"ph" : "X","name" :"model_run","args" : {}}
]
PINTOPINTO

色付きのバーの部分をクリックするとオペレーションごとの推論時間やスレッドごとの処理結果などが細かく表示される。

PINTOPINTO

Lite-HRNet をONNXで推論するとやたらと遅い現象の調査ログ。各オペレーションの間に未処理の時間が大量にある。それに加えてかなり大きな時間にわたって未処理の時間が何箇所か有るように見える。