🐾

Pythonのオブジェクトはメモリ上でどう表現されているのか? 後編

に公開

はじめに

Pythonでの開発中はCやC++と違いメモリの内容を意識することは、ほとんどありません。

Pythonのオブジェクトはメモリ上でどう表現されているのか? 前編ではfloat, int, bytes, str型のオブジェクトが実際どのようにメモリに格納されているかを確認しました。

本ドキュメントではtuple, listやdictなどのオブジェクトが実際どのようにメモリに格納されているかを実験しつつ確認してみます。

対象のCPythonのバージョン:
https://github.com/python/cpython/blob/3.13

Python上でのオブジェクトのメモリアドレスの探索

PyTupleObjectの探索

pythonのtuple型はPyTupleObject構造体で表せます。

以下はその構造を解析して、出力するサンプルになります。

サンプルコードと出力結果

サンプルコード

import ctypes, sys

assert sys.version_info.major == 3
assert sys.version_info.minor == 13
assert sys.implementation.name == 'cpython'
assert sys._is_gil_enabled()

# _object
# 実際には何も PyObject として宣言されていないが、あらゆる Python オブジェクトへのポインタは PyObject* にキャストできる。
# これは手作業で実現した継承に相当する。
# 同様に、可変サイズの Python オブジェクトへのポインタは、さらに PyVarObject* にキャストできる。
# https://github.com/python/cpython/blob/3.13/Include/object.h
# GILが有効であることが前提である
class PyObject(ctypes.Structure):
    _fields_ = [
        ("ob_refcnt", ctypes.c_ssize_t),
        ("ob_type", ctypes.c_void_p),
    ]


# PyVarObject
# https://github.com/python/cpython/blob/3.13/Include/object.h#L224
class PyVarObject(ctypes.Structure):
    _fields_ = [
        ("ob_base", PyObject),
        ("ob_size", ctypes.c_ssize_t),
    ]

def inspect_base(addr):
    # 指定のポインタをPyObjectの型にキャストする
    obj = ctypes.cast(addr, ctypes.POINTER(PyObject)).contents
    ob_refcnt = obj.ob_refcnt
    ob_type = obj.ob_type
    try:
        type_obj = ctypes.cast(ob_type, ctypes.py_object).value
    except Exception as e:
        type_obj = f"<cannot cast to py_object: {e}>"
    return {
        "addr": hex(addr),
        "ob_refcnt": ob_refcnt,
        "ob_type": hex(ob_type),
        "type_obj": type_obj
    }

def inspect_tuple(addr):
    # https://github.com/python/cpython/blob/3.13/Include/cpython/tupleobject.h
    # PyObject_VAR_HEADの後に、PyObject *ob_itemが続く
    res = inspect_base(addr)
    obj = ctypes.cast(addr, ctypes.POINTER(PyVarObject)).contents
    res['ob_size'] = obj.ob_size
    ob_size = obj.ob_size
    elements = []
    data_addr = addr + ctypes.sizeof(PyVarObject)
    if ob_size:
        BufType = ctypes.c_void_p * ob_size
        ptr_list = BufType.from_address(data_addr)   # 危険: アドレスが有効であることが前提
        for ptr in ptr_list:
            elements.append(inspect_base(ptr))
    res['elements'] = elements
    return res

def print_tuple(name, addr, indent=0):
    res = inspect_tuple(addr)
    print('------------------------')
    print(" " * indent, "name:", name)
    print(" " * indent, "addr:", res.get('addr'))
    print(" " * indent, "ob_refcnt:", res.get('ob_refcnt'))
    print(" " * indent, "ob_type:", res.get('ob_type'))
    print(" " * indent, "型", res.get('type_obj'))
    print(" " * indent, "ob_size", res.get('ob_size'))
    for ix, element in enumerate(res.get('elements')):
        print(" " * (indent + 1), f"[{ix}]-----------------")
        print(" " * (indent + 1), "addr:", element.get('addr'))
        print(" " * (indent + 1), "ob_refcnt:", element.get('ob_refcnt'))
        print(" " * (indent + 1), "ob_type:", element.get('ob_type'))
        print(" " * (indent + 1), "型", element.get('type_obj'))


var1 = (1, "文字")
print_tuple(f"tuple: {var1}", id(var1))

var2 = ()
print_tuple(f"tuple: {var2}", id(var2))

出力結果

------------------------
 name: tuple: (1, '文字')
 addr: 0x106586640
 ob_refcnt: 3
 ob_type: 0x106c86b80
 型 <class 'tuple'>
 ob_size 2
  [0]-----------------
  addr: 0x106ca74c0
  ob_refcnt: 4294967295
  ob_type: 0x106c7eec0
  型 <class 'int'>
  [1]-----------------
  addr: 0x10658c8a0
  ob_refcnt: 2
  ob_type: 0x106c8bc10
  型 <class 'str'>
------------------------
 name: tuple: ()
 addr: 0x106cb96d8
 ob_refcnt: 4294967295
 ob_type: 0x106c86b80
 型 <class 'tuple'>
 ob_size 0

オブジェクトのアドレスからPyTupleObjectを表現する

PyTupleObject型は以下の構造体です。

typedef struct {
    PyObject_VAR_HEAD
    /* ob_item contains space for 'ob_size' elements.
       Items must normally not be NULL, except during construction when
       the tuple is not yet visible outside the function that builds it. */
    PyObject *ob_item[1];
} PyTupleObject;

すなわち、PyObject_VAR_HEAD(PyVarObject)の後にPyObjectがPyVarObject.ob_sizeのサイズだけ続く構成になっています。
これをpythonで取得するには以下のようになります。

    res = inspect_base(addr)
    obj = ctypes.cast(addr, ctypes.POINTER(PyVarObject)).contents
    res['ob_size'] = obj.ob_size
    ob_size = obj.ob_size
    elements = []
    data_addr = addr + ctypes.sizeof(PyVarObject)
    if ob_size:
        BufType = ctypes.c_void_p * ob_size
        ptr_list = BufType.from_address(data_addr)   # 危険: アドレスが有効であることが前提
        for ptr in ptr_list:
            elements.append(inspect_base(ptr))

PyListObjectの探索

pythonのlist型はPyListObject構造体で表せます。

これの構造を解析するサンプルコードは以下のようになります。

サンプルコードと出力結果

サンプルコード

import ctypes, sys

assert sys.version_info.major == 3
assert sys.version_info.minor == 13
assert sys.implementation.name == 'cpython'
assert sys._is_gil_enabled()

# _object
# 実際には何も PyObject として宣言されていないが、あらゆる Python オブジェクトへのポインタは PyObject* にキャストできる。
# これは手作業で実現した継承に相当する。
# 同様に、可変サイズの Python オブジェクトへのポインタは、さらに PyVarObject* にキャストできる。
# https://github.com/python/cpython/blob/3.13/Include/object.h
# GILが有効であることが前提である
class PyObject(ctypes.Structure):
    _fields_ = [
        ("ob_refcnt", ctypes.c_ssize_t),
        ("ob_type", ctypes.c_void_p),
    ]


# PyVarObject
# https://github.com/python/cpython/blob/3.13/Include/object.h#L224
class PyVarObject(ctypes.Structure):
    _fields_ = [
        ("ob_base", PyObject),
        ("ob_size", ctypes.c_ssize_t),
    ]

# https://github.com/python/cpython/blob/3.13/Include/cpython/listobject.h
class PyListObject(ctypes.Structure):
    _fields_ = [
        ("head",   PyVarObject), 
        ("ob_item",   ctypes.POINTER(ctypes.c_void_p)),  # PyObject**(要素ポインタ配列)
        ("allocated", ctypes.c_ssize_t),                 # capacity
    ]

def inspect_base(addr):
    # 指定のポインタをPyObjectの型にキャストする
    obj = ctypes.cast(addr, ctypes.POINTER(PyObject)).contents
    ob_refcnt = obj.ob_refcnt
    ob_type = obj.ob_type
    try:
        type_obj = ctypes.cast(ob_type, ctypes.py_object).value
    except Exception as e:
        type_obj = f"<cannot cast to py_object: {e}>"
    return {
        "addr": hex(addr),
        "ob_refcnt": ob_refcnt,
        "ob_type": hex(ob_type),
        "type_obj": type_obj
    }

def inspect_list(addr):
    # https://github.com/python/cpython/blob/3.13/Include/cpython/tupleobject.h
    # PyObject_VAR_HEADの後に、PyObject *ob_itemが続く
    res = inspect_base(addr)
    obj = ctypes.cast(addr, ctypes.POINTER(PyListObject)).contents
    ob_size = obj.head.ob_size
    allocated = obj.allocated
    res['ob_size'] = ob_size
    res['allocated'] = allocated
    elements = []
    for i in range(ob_size):
        elem_addr = int(obj.ob_item[i])
        elements.append(inspect_base(elem_addr))
    res['elements'] = elements
    return res

def print_list(name, addr, indent=0):
    res = inspect_list(addr)
    print('------------------------')
    print(" " * indent, "name:", name)
    print(" " * indent, "addr:", res.get('addr'))
    print(" " * indent, "ob_refcnt:", res.get('ob_refcnt'))
    print(" " * indent, "ob_type:", res.get('ob_type'))
    print(" " * indent, "型", res.get('type_obj'))
    print(" " * indent, "ob_size", res.get('ob_size'))
    print(" " * indent, "allocated", res.get('allocated'))
    for ix, element in enumerate(res.get('elements')):
        print(" " * (indent + 1), f"[{ix}]-----------------")
        print(" " * (indent + 1), "addr:", element.get('addr'))
        print(" " * (indent + 1), "ob_refcnt:", element.get('ob_refcnt'))
        print(" " * (indent + 1), "ob_type:", element.get('ob_type'))
        print(" " * (indent + 1), "型", element.get('type_obj'))


var1 = [100, 200, 50, 1]
print_list(f"list: {var1}", id(var1))

var1.append(10)
print_list(f"list: {var1}の項目を追加したケース", id(var1))

var1[0] = 105
print_list(f"list: {var1}の項目0を同じ型で更新したケース", id(var1))

var1[0] = "105"
print_list(f"list: {var1}の項目0を別の型で更新したケース", id(var1))

del var1[0]
print_list(f"list: {var1}の項目0を削除したケース", id(var1))

var1.sort()
print_list(f"list: {var1}をソートした結果", id(var1))

var1.clear()
print_list(f"listをクリアした", id(var1))

出力結果

------------------------
 name: list: [100, 200, 50, 1]
 addr: 0x10f9cff80
 ob_refcnt: 1
 ob_type: 0x1100154c0
 型 <class 'list'>
 ob_size 4
 allocated 4
  [0]-----------------
  addr: 0x11003f120
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [1]-----------------
  addr: 0x11003fda0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [2]-----------------
  addr: 0x11003eae0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [3]-----------------
  addr: 0x11003e4c0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
------------------------
 name: list: [100, 200, 50, 1, 10]の項目を追加したケース
 addr: 0x10f9cff80
 ob_refcnt: 1
 ob_type: 0x1100154c0
 型 <class 'list'>
 ob_size 5
 allocated 8
  [0]-----------------
  addr: 0x11003f120
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [1]-----------------
  addr: 0x11003fda0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [2]-----------------
  addr: 0x11003eae0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [3]-----------------
  addr: 0x11003e4c0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [4]-----------------
  addr: 0x11003e5e0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
------------------------
 name: list: [105, 200, 50, 1, 10]の項目0を同じ型で更新したケース
 addr: 0x10f9cff80
 ob_refcnt: 1
 ob_type: 0x1100154c0
 型 <class 'list'>
 ob_size 5
 allocated 8
  [0]-----------------
  addr: 0x11003f1c0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [1]-----------------
  addr: 0x11003fda0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [2]-----------------
  addr: 0x11003eae0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [3]-----------------
  addr: 0x11003e4c0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [4]-----------------
  addr: 0x11003e5e0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
------------------------
 name: list: ['105', 200, 50, 1, 10]の項目0を別の型で更新したケース
 addr: 0x10f9cff80
 ob_refcnt: 1
 ob_type: 0x1100154c0
 型 <class 'list'>
 ob_size 5
 allocated 8
  [0]-----------------
  addr: 0x10f9d8570
  ob_refcnt: 3
  ob_type: 0x110022c10
  型 <class 'str'>
  [1]-----------------
  addr: 0x11003fda0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [2]-----------------
  addr: 0x11003eae0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [3]-----------------
  addr: 0x11003e4c0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [4]-----------------
  addr: 0x11003e5e0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
------------------------
 name: list: [200, 50, 1, 10]の項目0を削除したケース
 addr: 0x10f9cff80
 ob_refcnt: 1
 ob_type: 0x1100154c0
 型 <class 'list'>
 ob_size 4
 allocated 8
  [0]-----------------
  addr: 0x11003fda0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [1]-----------------
  addr: 0x11003eae0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [2]-----------------
  addr: 0x11003e4c0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [3]-----------------
  addr: 0x11003e5e0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
------------------------
 name: list: [1, 10, 50, 200]をソートした結果
 addr: 0x10f9cff80
 ob_refcnt: 1
 ob_type: 0x1100154c0
 型 <class 'list'>
 ob_size 4
 allocated 8
  [0]-----------------
  addr: 0x11003e4c0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [1]-----------------
  addr: 0x11003e5e0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [2]-----------------
  addr: 0x11003eae0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
  [3]-----------------
  addr: 0x11003fda0
  ob_refcnt: 4294967295
  ob_type: 0x110015ec0
  型 <class 'int'>
------------------------
 name: listをクリアした
 addr: 0x10f9cff80
 ob_refcnt: 1
 ob_type: 0x1100154c0
 型 <class 'list'>
 ob_size 0
 allocated 0

オブジェクトのアドレスからPyListObjectを表現する

Cで記載されたPyListObjectctypes.Structureを継承したクラスで表現することが可能です

class PyListObject(ctypes.Structure):
    _fields_ = [
        ("head",   PyVarObject), 
        ("ob_item",   ctypes.POINTER(ctypes.c_void_p)),  # PyObject**(要素ポインタ配列)
        ("allocated", ctypes.c_ssize_t),                 # capacity
    ]

tuple型と違い、動的に要素の項目が追加・削除されるため、ob_itemは「要素(PyObject*)を並べた配列を指すポインタ」で表現されており、その確保済みの数が確保済みスロット数が allocatedとなります。実際の要素数はob_sizeですが、allocatedについては伸長時の再割り当て回数を減らすために過剰確保されます。

idを使用して取得したオブジェクトのアドレスから、ctypes.Structureを継承したクラスにキャストすることが可能です。

    res = inspect_base(addr)
    obj = ctypes.cast(addr, ctypes.POINTER(PyListObject)).contents

各要素の内容を取得するコードは以下のようになります。

    elements = []
    for i in range(ob_size):
        elem_addr = int(obj.ob_item[i])
        elements.append(inspect_base(elem_addr))
    res['elements'] = elements

要素の更新に伴う挙動の確認

var1 = [100, 200, 50, 1]と宣言した場合、ob_size=4, allocated=4となっています。

この変数に対してvar1.append(10)によって要素を追加すると、ob_size=5, allocated=8となります。

var1[0]=105のように要素と同じ型で更新した場合、ob_size=5, allocated=8のままで、要素0のアドレスは同じアドレスのままとなります。

var1[0]='105'のように要素と違う型で更新した場合、ob_size=5, allocated=8のままで、要素0のアドレスは違うアドレスになります。

del var1[0]で要素0を削除した場合、ob_size=5, allocated=8のままでした。

var1.sort()でソートを実行した場合、ob_size=5, allocated=8のままで、ソート前に対応する各要素のアドレスはソート後と一致します。

var1.clear()でクリアをした場合、ob_size=0, allocated=0となります。

PyDictObjectの探索

pythonのdict型はPyDictObject構造体で表せます。

この構造体を解析するには、構成する別の構造体も見る必要があります。

その他、以下のドキュメントやコメントが解析の役に立ちます。
https://github.com/python/cpython/blob/3.13/Objects/dictobject.c#L288-L376
https://github.com/python/cpython/blob/3.13/Objects/dictnotes.txt

以下はdict型の挙動を実験したコードとなります。

サンプルコードと出力結果

サンプルコード

import ctypes, sys

assert sys.version_info.major == 3
assert sys.version_info.minor == 13
assert sys.implementation.name == 'cpython'
assert sys._is_gil_enabled()

# _object
# 実際には何も PyObject として宣言されていないが、あらゆる Python オブジェクトへのポインタは PyObject* にキャストできる。
# これは手作業で実現した継承に相当する。
# 同様に、可変サイズの Python オブジェクトへのポインタは、さらに PyVarObject* にキャストできる。
# https://github.com/python/cpython/blob/3.13/Include/object.h
# GILが有効であることが前提である
class PyObject(ctypes.Structure):
    _fields_ = [
        ("ob_refcnt", ctypes.c_ssize_t),
        ("ob_type", ctypes.c_void_p),
    ]


# https://github.com/python/cpython/blob/3.13/Include/cpython/dictobject.h#L5
class PyDictObject(ctypes.Structure):
    _fields_ = [
        ("head", PyObject),
        ("ma_used", ctypes.c_ssize_t),
        # ビット 0–7 は辞書ウォッチャー(dict watchers)用。
        # ビット 8–11 は監視対象の変更カウンタ(tier2 最適化で使用)。
        # 残りのビット(12–63)は実際のバージョンタグ
        ("ma_version_tag", ctypes.c_uint64),
        ("ma_keys", ctypes.c_void_p),
        # ma_values が NULL の場合、テーブルは「combined(結合)」である:
        # キーと値は ma_keys に格納される。
        # ma_values が NULL でない場合、テーブルは「split(分離)」である:
        # キーは ma_keys に、値は ma_values に格納される。
        ("ma_values", ctypes.c_void_p),
    ]

# https://github.com/python/cpython/blob/3.13/Include/internal/pycore_dict.h#L140
class PyDictKeysObject(ctypes.Structure):
    _fields_ = [
        ("dk_refcnt", ctypes.c_ssize_t),
        ("dk_log2_size", ctypes.c_uint8),
        ("dk_log2_index_bytes", ctypes.c_uint8),
        ("dk_kind", ctypes.c_uint8),
        # uint32 整列用のパディング
        ("_pad", ctypes.c_uint8),
        ("dk_version", ctypes.c_uint32),
        ("dk_usable", ctypes.c_ssize_t),
        ("dk_nentries", ctypes.c_ssize_t),
        # ここから先に flexible array: char dk_indices[]
    ]

# https://github.com/python/cpython/blob/3.13/Include/internal/pycore_dict.h#L76
class PyDictKeyEntry(ctypes.Structure):
    _fields_ = [
        ("me_hash", ctypes.c_ssize_t),
        ("me_key", ctypes.c_void_p),
        # This field is only meaningful for combined tables
        ("me_value", ctypes.c_void_p),
    ]

# https://github.com/python/cpython/blob/3.13/Include/internal/pycore_dict.h#L83
class PyDictUnicodeEntry(ctypes.Structure):
    _fields_ = [
        # The key must be Unicode and have hash
        ("me_key", ctypes.c_void_p),
        # This field is only meaningful for combined tables
        ("me_value", ctypes.c_void_p),
    ]

# https://github.com/python/cpython/blob/3.13/Include/internal/pycore_dict.h#L196
class PyDictValues(ctypes.Structure):
    _fields_ = [
        ("capacity", ctypes.c_uint8),
        ("size", ctypes.c_uint8),
        ("embedded", ctypes.c_uint8),
        ("valid", ctypes.c_uint8),
        ("values", ctypes.c_void_p),
    ]


def inspect_base(addr):
    # 指定のポインタをPyObjectの型にキャストする
    obj = ctypes.cast(addr, ctypes.POINTER(PyObject)).contents
    ob_refcnt = obj.ob_refcnt
    ob_type = obj.ob_type
    try:
        type_obj = ctypes.cast(ob_type, ctypes.py_object).value
    except Exception as e:
        type_obj = f"<cannot cast to py_object: {e}>"
    return {
        "addr": hex(addr),
        "ob_refcnt": ob_refcnt,
        "ob_type": hex(ob_type),
        "type_obj": type_obj
    }

def inspect_dict(addr):
    # https://github.com/python/cpython/blob/3.13/Objects/dictobject.c
    # https://github.com/python/cpython/blob/3.13/Objects/dictnotes.txt
    res_base = inspect_base(addr)
    assert res_base.get('type_obj') == dict
    obj = ctypes.cast(addr, ctypes.POINTER(PyDictObject)).contents
    res = {
        "ma_used": obj.ma_used,
        "ma_version_tag": obj.ma_version_tag,
        "ma_keys_addr": obj.ma_keys,
        "ma_values_addr": obj.ma_values,
    } | res_base
    key_obj = ctypes.cast(obj.ma_keys, ctypes.POINTER(PyDictKeysObject)).contents
    DictKeysKind = {
        0: "DICT_KEYS_GENERAL",
        1: "DICT_KEYS_UNICODE",
        2: "DICT_KEYS_SPLIT",
    }
    dk_kind = DictKeysKind.get(key_obj.dk_kind)
    ma_keys = {
        "dk_refcnt": key_obj.dk_refcnt,
        "dk_log2_size": key_obj.dk_log2_size,
        "dk_log2_index_bytes": key_obj.dk_log2_index_bytes,
        "dk_kind": dk_kind,
        "dk_version": key_obj.dk_version,
        "dk_usable": key_obj.dk_usable,
        "dk_nentries": key_obj.dk_nentries,
    }
    res['ma_keys'] = ma_keys
    base_indices = obj.ma_keys + ctypes.sizeof(PyDictKeysObject) 
    # https://github.com/python/cpython/blob/main/Include/internal/pycore_dict.h#L237C9-L237C20
    # Include/pymacro.h
    # Prevent using an expression as a l-value.
    # For example, "int x; _Py_RVALUE(x) = 1;" fails with a compiler error.
    # #define _Py_RVALUE(EXPR) ((void)0, (EXPR))
    dk_size = 1 << key_obj.dk_log2_size
    ma_keys['dk_size'] = dk_size

    #  https://github.com/python/cpython/blob//3.13/Include/internal/pycore_dict.h#L167
    if dk_size <= 0xff:
        dk_indices = (ctypes.c_int8 * dk_size).from_address(base_indices)
    elif dk_size <= 0xffff:
        dk_indices = (ctypes.c_int16 * dk_size).from_address(base_indices)
    elif dk_size <= 0xffffffff:
        dk_indices = (ctypes.c_int32 * dk_size).from_address(base_indices)
    else:
        dk_indices = (ctypes.c_int64 * dk_size).from_address(base_indices)
    indices = []
    for i in range(dk_size):
        indices.append(dk_indices[i])
    ma_keys['dk_indices'] = indices
    indices_bytes_total = 1 << key_obj.dk_log2_index_bytes       # dk_indices 全体のバイト数
    entries_addr = base_indices + indices_bytes_total

    # USABLE_FRACTION
    # https://github.com/python/cpython/blob/3.13/Objects/dictobject.c#L576
    usable = (dk_size * 2) // 3
    Entry = PyDictUnicodeEntry if dk_kind != "DICT_KEYS_GENERAL" else PyDictKeyEntry
    entries = (Entry * usable).from_address(entries_addr)
    dk_entries = []
    for i in range(usable):
        e = entries[i]
        key = None
        if e.me_key:
            key = ctypes.cast(e.me_key, ctypes.py_object).value
        value = None
        if e.me_value:
            value = ctypes.cast(e.me_value, ctypes.py_object).value
        entry = {
            "me_key": key,
            "me_value": value,
        }
        if dk_kind == "DICT_KEYS_GENERAL":
            entry['hash'] = e.me_hash
        elif key:
            entry['hash'] = key.__hash__()
        dk_entries.append(entry)
    ma_keys['dk_entries'] = dk_entries
    if obj.ma_values:
        value_obj = ctypes.cast(obj.ma_values, ctypes.POINTER(PyDictValues)).contents
        ma_values = {}
        ma_values['capacity'] = value_obj.capacity
        ma_values['size'] = value_obj.size
        ma_values['embedded'] = value_obj.embedded
        ma_values['valid'] = value_obj.valid

        value_addr = obj.ma_values + PyDictValues.values.offset
        BufType = ctypes.c_void_p * value_obj.size
        ptr_list = BufType.from_address(value_addr)
        ma_values['values'] = []
        for ptr in ptr_list:
            v = ctypes.cast(ptr, ctypes.py_object).value
            ma_values['values'].append(v)
        res['ma_values'] = ma_values

    return res

def print_dict(name, addr, indent=0):
    res = inspect_dict(addr)
    print('------------------------')
    print(" " * indent, "name:", name)
    print(" " * indent, "addr:", res.get('addr'))
    print(" " * indent, "ob_refcnt:", res.get('ob_refcnt'))
    print(" " * indent, "ob_type:", res.get('ob_type'))
    print(" " * indent, "型:", res.get('type_obj'))
    print(" " * indent, "ma_used:", res.get('ma_used'))
    print(" " * indent, "ma_version_tag:", res.get('ma_version_tag'))
    ma_keys = res.get('ma_keys')
    print(" " * indent, "ma_keys-------------", hex(res.get('ma_keys_addr')))
    print(" " * (indent + 1), "dk_refcnt:", ma_keys.get('dk_refcnt'))
    print(" " * (indent + 1), "dk_log2_size:", ma_keys.get('dk_log2_size'))
    print(" " * (indent + 1), "dk_log2_index_bytes:", ma_keys.get('dk_log2_index_bytes'))
    print(" " * (indent + 1), "dk_kind:", ma_keys.get('dk_kind'))
    print(" " * (indent + 1), "dk_version:", ma_keys.get('dk_version'))
    print(" " * (indent + 1), "dk_usable:", ma_keys.get('dk_usable'))
    print(" " * (indent + 1), "dk_nentries:", ma_keys.get('dk_nentries'))
    print(" " * (indent + 1), "dk_indices--------")
    for ix, element in enumerate(ma_keys.get('dk_indices')):
        print(" " * (indent + 2), f"{ix}: {element}")
    print(" " * (indent + 1), "dk_entries--------")
    for ix, element in enumerate(ma_keys.get('dk_entries')):
        print(" " * (indent + 2), f"{ix}: me_key:{element.get('me_key')} me_value:{element.get('me_value')} hash:{element.get('hash')}")
    ma_values = res.get('ma_values')
    if ma_values:
        print(" " * indent, "ma_values-------------", hex(res.get('ma_values_addr')))
        print(" " * (indent + 1), "capacity:", ma_values.get('capacity'))
        print(" " * (indent + 1), "size:", ma_values.get('size'))
        print(" " * (indent + 1), "embedded:", ma_values.get('embedded'))
        print(" " * (indent + 1), "valid:", ma_values.get('valid'))
        print(" " * (indent + 1), "values--------")
        for ix, value in enumerate(ma_values.get('values')):
            print(" " * (indent + 2), f"{ix}: {value}")


var1 = {9: "test1", 103: "test2", 1: "test3"}
print_dict("数値がキー", id(var1))

var2 = {
    "a": "test1",
    "b": "ああああ"
}
print_dict("文字がキー", id(var2))

class C:
    def __init__(self):
        self.x = 1
        self.y = "test"
        self.z = [1,2,3]

var3 = C().__dict__
print_dict("SPLITの場合", id(var3))

var2["c"] = "追加"
print_dict("キーを追加したケース", id(var2))

del var2["a"]
print_dict(f"キーを削除したケース {var2}", id(var2))

var2["d"] = "削除後に追加"
print_dict("削除後に追加したケース", id(var2))

var2["e"] = "拡張A"
var2["f"] = "拡張B"
print_dict("dk_usableの残り以上に追加", id(var2))

出力結果

------------------------
 name: 数値がキー
 addr: 0x10befbc80
 ob_refcnt: 1
 ob_type: 0x10c536698
 型: <class 'dict'>
 ma_used: 3
 ma_version_tag: 104620032
 ma_keys------------- 0x10bf14670
  dk_refcnt: 1
  dk_log2_size: 3
  dk_log2_index_bytes: 3
  dk_kind: DICT_KEYS_GENERAL
  dk_version: 0
  dk_usable: 2
  dk_nentries: 3
  dk_indices--------
   0: -1
   1: 0
   2: -1
   3: -1
   4: -1
   5: -1
   6: 2
   7: 1
  dk_entries--------
   0: me_key:9 me_value:test1 hash:9
   1: me_key:103 me_value:test2 hash:103
   2: me_key:1 me_value:test3 hash:1
   3: me_key:None me_value:None hash:0
   4: me_key:None me_value:None hash:0
------------------------
 name: 文字がキー
 addr: 0x10beef400
 ob_refcnt: 1
 ob_type: 0x10c536698
 型: <class 'dict'>
 ma_used: 2
 ma_version_tag: 105148416
 ma_keys------------- 0x10bf1c630
  dk_refcnt: 1
  dk_log2_size: 3
  dk_log2_index_bytes: 3
  dk_kind: DICT_KEYS_UNICODE
  dk_version: 0
  dk_usable: 3
  dk_nentries: 2
  dk_indices--------
   0: -1
   1: 0
   2: -1
   3: -1
   4: -1
   5: -1
   6: 1
   7: -1
  dk_entries--------
   0: me_key:a me_value:test1 hash:-4045657521913962983
   1: me_key:b me_value:ああああ hash:-4531556536679314122
   2: me_key:None me_value:None hash:None
   3: me_key:None me_value:None hash:None
   4: me_key:None me_value:None hash:None
------------------------
 name: SPLITの場合
 addr: 0x10beef2c0
 ob_refcnt: 1
 ob_type: 0x10c536698
 型: <class 'dict'>
 ma_used: 3
 ma_version_tag: 105512960
 ma_keys------------- 0x7fd0f9951a70
  dk_refcnt: 2
  dk_log2_size: 6
  dk_log2_index_bytes: 6
  dk_kind: DICT_KEYS_SPLIT
  dk_version: 0
  dk_usable: 26
  dk_nentries: 3
  dk_indices--------
   0: -1
   1: 2
   2: -1
   3: -1
   4: -1
   5: 1
   6: -1
   7: -1
   8: -1
   9: -1
   10: -1
   11: -1
   12: -1
   13: -1
   14: -1
   15: -1
   16: -1
   17: -1
   18: -1
   19: -1
   20: -1
   21: -1
   22: -1
   23: -1
   24: -1
   25: -1
   26: -1
   27: -1
   28: -1
   29: -1
   30: -1
   31: -1
   32: -1
   33: -1
   34: -1
   35: -1
   36: -1
   37: -1
   38: -1
   39: 0
   40: -1
   41: -1
   42: -1
   43: -1
   44: -1
   45: -1
   46: -1
   47: -1
   48: -1
   49: -1
   50: -1
   51: -1
   52: -1
   53: -1
   54: -1
   55: -1
   56: -1
   57: -1
   58: -1
   59: -1
   60: -1
   61: -1
   62: -1
   63: -1
  dk_entries--------
   0: me_key:x me_value:None hash:-1577677568910002585
   1: me_key:y me_value:None hash:4391882689425133189
   2: me_key:z me_value:None hash:2049973063166464897
   3: me_key:None me_value:None hash:None
   4: me_key:None me_value:None hash:None
   5: me_key:None me_value:None hash:None
   6: me_key:None me_value:None hash:None
   7: me_key:None me_value:None hash:None
   8: me_key:None me_value:None hash:None
   9: me_key:None me_value:None hash:None
   10: me_key:None me_value:None hash:None
   11: me_key:None me_value:None hash:None
   12: me_key:None me_value:None hash:None
   13: me_key:None me_value:None hash:None
   14: me_key:None me_value:None hash:None
   15: me_key:None me_value:None hash:None
   16: me_key:None me_value:None hash:None
   17: me_key:None me_value:None hash:None
   18: me_key:None me_value:None hash:None
   19: me_key:None me_value:None hash:None
   20: me_key:None me_value:None hash:None
   21: me_key:None me_value:None hash:None
   22: me_key:None me_value:None hash:None
   23: me_key:None me_value:None hash:None
   24: me_key:None me_value:None hash:None
   25: me_key:None me_value:None hash:None
   26: me_key:None me_value:None hash:None
   27: me_key:None me_value:None hash:None
   28: me_key:None me_value:None hash:None
   29: me_key:None me_value:None hash:None
   30: me_key:None me_value:None hash:None
   31: me_key:None me_value:None hash:None
   32: me_key:None me_value:None hash:None
   33: me_key:None me_value:None hash:None
   34: me_key:None me_value:None hash:None
   35: me_key:None me_value:None hash:None
   36: me_key:None me_value:None hash:None
   37: me_key:None me_value:None hash:None
   38: me_key:None me_value:None hash:None
   39: me_key:None me_value:None hash:None
   40: me_key:None me_value:None hash:None
   41: me_key:None me_value:None hash:None
 ma_values------------- 0x10be89bd0
  capacity: 29
  size: 3
  embedded: 0
  valid: 0
  values--------
   0: 1
   1: test
   2: [1, 2, 3]
------------------------
 name: キーを追加したケース
 addr: 0x10beef400
 ob_refcnt: 1
 ob_type: 0x10c536698
 型: <class 'dict'>
 ma_used: 3
 ma_version_tag: 106618880
 ma_keys------------- 0x10bf1c630
  dk_refcnt: 1
  dk_log2_size: 3
  dk_log2_index_bytes: 3
  dk_kind: DICT_KEYS_UNICODE
  dk_version: 0
  dk_usable: 2
  dk_nentries: 3
  dk_indices--------
   0: 2
   1: 0
   2: -1
   3: -1
   4: -1
   5: -1
   6: 1
   7: -1
  dk_entries--------
   0: me_key:a me_value:test1 hash:-4045657521913962983
   1: me_key:b me_value:ああああ hash:-4531556536679314122
   2: me_key:c me_value:追加 hash:2418154100820689008
   3: me_key:None me_value:None hash:None
   4: me_key:None me_value:None hash:None
------------------------
 name: キーを削除したケース {'b': 'ああああ', 'c': '追加'}
 addr: 0x10beef400
 ob_refcnt: 1
 ob_type: 0x10c536698
 型: <class 'dict'>
 ma_used: 2
 ma_version_tag: 106864640
 ma_keys------------- 0x10bf1c630
  dk_refcnt: 1
  dk_log2_size: 3
  dk_log2_index_bytes: 3
  dk_kind: DICT_KEYS_UNICODE
  dk_version: 0
  dk_usable: 2
  dk_nentries: 3
  dk_indices--------
   0: 2
   1: -2
   2: -1
   3: -1
   4: -1
   5: -1
   6: 1
   7: -1
  dk_entries--------
   0: me_key:None me_value:None hash:None
   1: me_key:b me_value:ああああ hash:-4531556536679314122
   2: me_key:c me_value:追加 hash:2418154100820689008
   3: me_key:None me_value:None hash:None
   4: me_key:None me_value:None hash:None
------------------------
 name: 削除後に追加したケース
 addr: 0x10beef400
 ob_refcnt: 1
 ob_type: 0x10c536698
 型: <class 'dict'>
 ma_used: 3
 ma_version_tag: 107106304
 ma_keys------------- 0x10bf1c630
  dk_refcnt: 1
  dk_log2_size: 3
  dk_log2_index_bytes: 3
  dk_kind: DICT_KEYS_UNICODE
  dk_version: 0
  dk_usable: 1
  dk_nentries: 4
  dk_indices--------
   0: 2
   1: -2
   2: -1
   3: -1
   4: -1
   5: -1
   6: 1
   7: 3
  dk_entries--------
   0: me_key:None me_value:None hash:None
   1: me_key:b me_value:ああああ hash:-4531556536679314122
   2: me_key:c me_value:追加 hash:2418154100820689008
   3: me_key:d me_value:削除後に追加 hash:-1764641579136718161
   4: me_key:None me_value:None hash:None
------------------------
 name: dk_usableの残り以上に追加
 addr: 0x10beef400
 ob_refcnt: 1
 ob_type: 0x10c536698
 型: <class 'dict'>
 ma_used: 5
 ma_version_tag: 107356160
 ma_keys------------- 0x10bf06b40
  dk_refcnt: 1
  dk_log2_size: 4
  dk_log2_index_bytes: 4
  dk_kind: DICT_KEYS_UNICODE
  dk_version: 0
  dk_usable: 5
  dk_nentries: 5
  dk_indices--------
   0: 1
   1: -1
   2: 4
   3: 3
   4: -1
   5: -1
   6: 0
   7: -1
   8: -1
   9: -1
   10: -1
   11: -1
   12: -1
   13: -1
   14: -1
   15: 2
  dk_entries--------
   0: me_key:b me_value:ああああ hash:-4531556536679314122
   1: me_key:c me_value:追加 hash:2418154100820689008
   2: me_key:d me_value:削除後に追加 hash:-1764641579136718161
   3: me_key:e me_value:拡張A hash:5065613218728902051
   4: me_key:f me_value:拡張B hash:5021542961059956192
   5: me_key:None me_value:None hash:None
   6: me_key:None me_value:None hash:None
   7: me_key:None me_value:None hash:None
   8: me_key:None me_value:None hash:None
   9: me_key:None me_value:None hash:None

オブジェクトのアドレスからPyDictObjectを表現する

Cで記載されたPyDictObject/PyDictKeysObject/PyDictKeyEntry/PyDictUnicodeEntry/PyDictValuesctypes.Structureを継承したクラスで表現することが可能です。

# https://github.com/python/cpython/blob/3.13/Include/cpython/dictobject.h#L5
class PyDictObject(ctypes.Structure):
    _fields_ = [
        ("head", PyObject),
        ("ma_used", ctypes.c_ssize_t),
        # ビット 0–7 は辞書ウォッチャー(dict watchers)用。
        # ビット 8–11 は監視対象の変更カウンタ(tier2 最適化で使用)。
        # 残りのビット(12–63)は実際のバージョンタグ
        ("ma_version_tag", ctypes.c_uint64),
        ("ma_keys", ctypes.c_void_p),
        # ma_values が NULL の場合、テーブルは「combined(結合)」である:
        # キーと値は ma_keys に格納される。
        # ma_values が NULL でない場合、テーブルは「split(分離)」である:
        # キーは ma_keys に、値は ma_values に格納される。
        ("ma_values", ctypes.c_void_p),
    ]

# https://github.com/python/cpython/blob/3.13/Include/internal/pycore_dict.h#L140
class PyDictKeysObject(ctypes.Structure):
    _fields_ = [
        ("dk_refcnt", ctypes.c_ssize_t),
        ("dk_log2_size", ctypes.c_uint8),
        ("dk_log2_index_bytes", ctypes.c_uint8),
        ("dk_kind", ctypes.c_uint8),
        # uint32 整列用のパディング
        ("_pad", ctypes.c_uint8),
        ("dk_version", ctypes.c_uint32),
        ("dk_usable", ctypes.c_ssize_t),
        ("dk_nentries", ctypes.c_ssize_t),
        # ここから先に flexible array: char dk_indices[]
    ]

# https://github.com/python/cpython/blob/3.13/Include/internal/pycore_dict.h#L76
class PyDictKeyEntry(ctypes.Structure):
    _fields_ = [
        ("me_hash", ctypes.c_ssize_t),
        ("me_key", ctypes.c_void_p),
        # This field is only meaningful for combined tables
        ("me_value", ctypes.c_void_p),
    ]

# https://github.com/python/cpython/blob/3.13/Include/internal/pycore_dict.h#L83
class PyDictUnicodeEntry(ctypes.Structure):
    _fields_ = [
        # The key must be Unicode and have hash
        ("me_key", ctypes.c_void_p),
        # This field is only meaningful for combined tables
        ("me_value", ctypes.c_void_p),
    ]

# https://github.com/python/cpython/blob/3.13/Include/internal/pycore_dict.h#L196
class PyDictValues(ctypes.Structure):
    _fields_ = [
        ("capacity", ctypes.c_uint8),
        ("size", ctypes.c_uint8),
        ("embedded", ctypes.c_uint8),
        ("valid", ctypes.c_uint8),
        ("values", ctypes.c_void_p),
    ]

idを使用して取得したオブジェクトのアドレスからPyDictObjectにキャストするには以下のような実装を行います。

    # https://github.com/python/cpython/blob/3.13/Objects/dictobject.c
    # https://github.com/python/cpython/blob/3.13/Objects/dictnotes.txt
    res_base = inspect_base(addr)
    assert res_base.get('type_obj') == dict
    obj = ctypes.cast(addr, ctypes.POINTER(PyDictObject)).contents
    res = {
        "ma_used": obj.ma_used,
        "ma_version_tag": obj.ma_version_tag,
        "ma_keys_addr": obj.ma_keys,
        "ma_values_addr": obj.ma_values,
    } | res_base

obj.ma_keysのアドレスをPyDictKeysObjectにキャストするのは以下のようにします。

    key_obj = ctypes.cast(obj.ma_keys, ctypes.POINTER(PyDictKeysObject)).contents
    DictKeysKind = {
        0: "DICT_KEYS_GENERAL",
        1: "DICT_KEYS_UNICODE",
        2: "DICT_KEYS_SPLIT",
    }
    dk_kind = DictKeysKind.get(key_obj.dk_kind)
    ma_keys = {
        "dk_refcnt": key_obj.dk_refcnt,
        "dk_log2_size": key_obj.dk_log2_size,
        "dk_log2_index_bytes": key_obj.dk_log2_index_bytes,
        "dk_kind": dk_kind,
        "dk_version": key_obj.dk_version,
        "dk_usable": key_obj.dk_usable,
        "dk_nentries": key_obj.dk_nentries,
    }
    res['ma_keys'] = ma_keys

次にdk_indicesを構築しますが、そのサイズとデータ幅を取得するためにdk_sizeを求めたのち、そこからdk_indicesを取得する必要があります。

    base_indices = obj.ma_keys + ctypes.sizeof(PyDictKeysObject) 
    # https://github.com/python/cpython/blob/main/Include/internal/pycore_dict.h#L237C9-L237C20
    # Include/pymacro.h
    # Prevent using an expression as a l-value.
    # For example, "int x; _Py_RVALUE(x) = 1;" fails with a compiler error.
    # #define _Py_RVALUE(EXPR) ((void)0, (EXPR))
    dk_size = 1 << key_obj.dk_log2_size
    ma_keys['dk_size'] = dk_size

    #  https://github.com/python/cpython/blob//3.13/Include/internal/pycore_dict.h#L167
    if dk_size <= 0xff:
        dk_indices = (ctypes.c_int8 * dk_size).from_address(base_indices)
    elif dk_size <= 0xffff:
        dk_indices = (ctypes.c_int16 * dk_size).from_address(base_indices)
    elif dk_size <= 0xffffffff:
        dk_indices = (ctypes.c_int32 * dk_size).from_address(base_indices)
    else:
        dk_indices = (ctypes.c_int64 * dk_size).from_address(base_indices)
    indices = []
    for i in range(dk_size):
        indices.append(dk_indices[i])
    ma_keys['dk_indices'] = indices

dk_entriesはdk_log2_index_bytesから開始位置を取得して、kindの種類とdk_sizeからサイズを取得して解析できます。

    indices_bytes_total = 1 << key_obj.dk_log2_index_bytes       # dk_indices 全体のバイト数
    entries_addr = base_indices + indices_bytes_total

    # USABLE_FRACTION
    # https://github.com/python/cpython/blob/3.13/Objects/dictobject.c#L576
    usable = (dk_size * 2) // 3
    Entry = PyDictUnicodeEntry if dk_kind != "DICT_KEYS_GENERAL" else PyDictKeyEntry
    entries = (Entry * usable).from_address(entries_addr)
    dk_entries = []
    for i in range(usable):
        e = entries[i]
        key = None
        if e.me_key:
            key = ctypes.cast(e.me_key, ctypes.py_object).value
        value = None
        if e.me_value:
            value = ctypes.cast(e.me_value, ctypes.py_object).value
        entry = {
            "me_key": key,
            "me_value": value,
        }
        if dk_kind == "DICT_KEYS_GENERAL":
            entry['hash'] = e.me_hash
        elif key:
            entry['hash'] = key.__hash__()
        dk_entries.append(entry)

dk_kindの確認

次のケースにおいてdk_kindが異なることが確認できます。
また、dk_kindがDICT_KEYS_SPLITの場合はma_valuesが存在しており、辞書の値が別領域に確保されていることが確認できます。

宣言 dk_kind ma_values
var1 = {9: "test1", 103: "test2", 1: "test3"} DICT_KEYS_GENERAL なし
`var2 = { "a": "test1", "b": "ああああ"} DICT_KEYS_UNICODE なし
var3 = C().__dict__ DICT_KEYS_SPLIT あり

データの追加と削除

データの追加削除をしたケースにおけるデータの変化を確認します。

操作 ma_used ma_version_tag dk_usable dk_nentries dk_indices
var2 = { "a": "test1", "b": "ああああ"} 2 105148416 3 2 [-1,0,-1,-1, -1, -1, -1, 1]
var2["c"] = "追加" 3 106618880 2 3 [2,0,-1,-1,-1, -1, -1, 1]
del var2["a"] 2 106864640 2 3 [2, -2,-1,-1, -1, -1, -1, 1]
var2["d"] = "削除後に追加" 3 107106304 1 4 [2, -2,-1,-1, -1, -1, 1, 3]
var2["e"], var2["f"]に追加 5 107356160 5 5 [1, -1, 4, 3, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, 2]
  • dictのデータ数はma_usedで表せる
  • ma_version_tagはdictを操作するたびに変更される
  • dk_nentriesはデータの削除が発生してもかわらない
  • dk_usableが0になったのちに追加をすると、dk_indicesの再配置がおこなわれる。
  • dk_indicesの格納位置はhashによって変わる。すなわち、dk_indicesは実行するたびに格納場所が変わるので上記の実験結果とは一致しない。

参考

Discussion