🐍

cpython source code reading (3.10.13), 1/n

2024/03/10に公開

(この記事は書きかけです。)

概要

何回かに分けて下記を理解していく予定です。

  • .pyが解釈されて実行されるまで
  • list型の詳細
  • dict型の詳細

対象のソースコードはこれです。Linuxのコードを読んでいきます。
Python-3.10.13.tgz

まず、
"* .pyが解釈されて実行されるまで"
について、正常系の処理の流れを追っていきます。

main()

main()を検索するところから始めます。

Programs/python.c
/* Minimal main program -- everything is loaded from the library */

#include "Python.h"

#ifdef MS_WINDOWS
int
wmain(int argc, wchar_t **argv)
{
    return Py_Main(argc, argv);
}
#else
int
main(int argc, char **argv)
{
    return Py_BytesMain(argc, argv);
}
#endif

Linuxなので#else側です。
どんどん追って行きます。

Modules/main.c
Py_BytesMain(int argc, char **argv)
pymain_main(_PyArgv *args)
Py_RunMain(void)
pymain_run_python(int *exitcode)
pymain_run_file(const PyConfig *config)
pymain_run_file_obj(PyObject *program_name, PyObject *filename,
Python/pythonrun.c
_PyRun_AnyFileObject(FILE *fp, PyObject *filename, int closeit,
                     PyCompilerFlags *flags)
  _PyRun_SimpleFileObject(FILE *fp, PyObject *filename, int closeit,
                        PyCompilerFlags *flags)

.py か .pyc か?

.pycは.pyをコンパイルしたバイトコード。

Python/pythonrun.c
int
_PyRun_SimpleFileObject(FILE *fp, PyObject *filename, int closeit,
                        PyCompilerFlags *flags)
{
...
    if (pyc) {
        FILE *pyc_fp;
        /* Try to run a pyc file. First, re-open in binary */
        if (closeit) {
            fclose(fp);
        }

        pyc_fp = _Py_fopen_obj(filename, "rb");
        if (pyc_fp == NULL) {
            fprintf(stderr, "python: Can't reopen .pyc file\n");
            goto done;
        }

        if (set_main_loader(d, filename, "SourcelessFileLoader") < 0) {
            fprintf(stderr, "python: failed to set __main__.__loader__\n");
            ret = -1;
            fclose(pyc_fp);
            goto done;
        }
        v = run_pyc_file(pyc_fp, d, d, flags);
    } else {
        /* When running from stdin, leave __main__.__loader__ alone */
        if (PyUnicode_CompareWithASCIIString(filename, "<stdin>") != 0 &&
            set_main_loader(d, filename, "SourceFileLoader") < 0) {
            fprintf(stderr, "python: failed to set __main__.__loader__\n");
            ret = -1;
            goto done;
        }
        v = pyrun_file(fp, filename, Py_file_input, d, d,
                       closeit, flags);
    }

.py の場合

Python/pythonrun.c
pyrun_file(FILE *fp, PyObject *filename, int start, PyObject *globals,
           PyObject *locals, int closeit, PyCompilerFlags *flags)
run_mod(mod_ty mod, PyObject *filename, PyObject *globals, PyObject *locals,
            PyCompilerFlags *flags, PyArena *arena)
run_eval_code_obj(PyThreadState *tstate, PyCodeObject *co, PyObject *globals, PyObject *locals)
Python/ceval.c
PyEval_EvalCode(PyObject *co, PyObject *globals, PyObject *locals)
_PyEval_Vector(PyThreadState *tstate, PyFrameConstructor *con,
               PyObject *locals,
               PyObject* const* args, size_t argcount,
               PyObject *kwnames)

.pyc の場合

Python/pythonrun.c
run_pyc_file(FILE *fp, PyObject *globals, PyObject *locals,
             PyCompilerFlags *flags)
run_eval_code_obj(PyThreadState *tstate, PyCodeObject *co, PyObject *globals, PyObject *locals)
Python/ceval.c
PyEval_EvalCode(PyObject *co, PyObject *globals, PyObject *locals)

.pyの場合と合流しました。

PyFrameObjectとは?

あとで調べる
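frameがなんの単位かわからないけど、code segment, symbolなどを保持しているので、ソースコードの解析結果の中間データなのだと思う。2つ以上存在することがあるのか? ただし、下記の struct _frame には f_back (previous frame, or NULL) があるので、複数のframeが連鎖して存在しうるように見える。また f_lasti (Last instruction) や f_valuestack のような実行中の状態も持っているので、解析結果というより実行時のデータである可能性が高い(要確認)。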

Python/ceval.c
    PyFrameObject *f = _PyEval_MakeFrameVector(
        tstate, con, locals, args, argcount, kwnames);
Include/internal/pycore_ceval.h
static inline PyObject*
_PyEval_EvalFrame(PyThreadState *tstate, PyFrameObject *f, int throwflag)
{
    return tstate->interp->eval_frame(tstate, f, throwflag);
}
Include/pyframe.h
typedef struct _frame PyFrameObject;
Include/cpython/frameobject.h
struct _frame {
    PyObject_VAR_HEAD
    struct _frame *f_back;      /* previous frame, or NULL */
    PyCodeObject *f_code;       /* code segment */
    PyObject *f_builtins;       /* builtin symbol table (PyDictObject) */
    PyObject *f_globals;        /* global symbol table (PyDictObject) */
    PyObject *f_locals;         /* local symbol table (any mapping) */
    PyObject **f_valuestack;    /* points after the last local */
    PyObject *f_trace;          /* Trace function */
    int f_stackdepth;           /* Depth of value stack */
    char f_trace_lines;         /* Emit per-line trace events? */
    char f_trace_opcodes;       /* Emit per-opcode trace events? */

    /* Borrowed reference to a generator, or NULL */
    PyObject *f_gen;

    int f_lasti;                /* Last instruction if called */
    int f_lineno;               /* Current line number. Only valid if non-zero */
    int f_iblock;               /* index in f_blockstack */
    PyFrameState f_state;       /* What state the frame is in */
    PyTryBlock f_blockstack[CO_MAXBLOCKS]; /* for try and loop blocks */
    PyObject *f_localsplus[1];  /* locals+stack, dynamically sized */
};
Python/pystate.c
PyInterpreterState_New(void)
    interp->eval_frame = _PyEval_EvalFrameDefault;
Python/ceval.c
PyObject* _Py_HOT_FUNCTION
_PyEval_EvalFrameDefault(PyThreadState *tstate, PyFrameObject *f, int throwflag)
...
main_loop:
    for (;;) {
...
    dispatch_opcode:
#ifdef DYNAMIC_EXECUTION_PROFILE
#ifdef DXPAIRS
        dxpairs[lastopcode][opcode]++;
        lastopcode = opcode;
#endif
        dxp[opcode]++;
#endif

        switch (opcode) {

        /* BEWARE!
           It is essential that any operation that fails must goto error
           and that all operation that succeed call DISPATCH() ! */

        case TARGET(NOP): {
            DISPATCH();
        }
...
        } /* switch */

        /* This should never be reached. Every opcode should end with DISPATCH()
           or goto error. */
        Py_UNREACHABLE();

これがメインループ(main_loop)らしい。
ほとんどのcaseの最後にDISPATCH()マクロがある。
switchの直後に Py_UNREACHABLE(); があり、コメントにも「すべてのopcodeは DISPATCH() か goto error で終わるべき」とあるので、バグがなければ、成功時はDISPATCH()を経由して次のopcodeを実行し、失敗時は goto error に飛ぶことになる。

opcode

Include/opcode.h
/* Auto-generated by Tools/scripts/generate_opcode_h.py from Lib/opcode.py */
#ifndef Py_OPCODE_H
#define Py_OPCODE_H
#ifdef __cplusplus
extern "C" {
#endif


    /* Instruction opcodes for compiled code */
#define POP_TOP                   1
#define ROT_TWO                   2
#define ROT_THREE                 3
#define DUP_TOP                   4
...

opcodeが160個くらいある

DISPATCH()とは?

Python/ceval.c
#define DISPATCH() \
    { \
        if (trace_info.cframe.use_tracing OR_DTRACE_LINE OR_LLTRACE) { \
            goto tracing_dispatch; \
        } \
        f->f_lasti = INSTR_OFFSET(); \
        NEXTOPARG(); \
        goto *opcode_targets[opcode]; \
    }

    tracing_dispatch:
Python/opcode_targets.h
static void *opcode_targets[256] = {
    &&_unknown_opcode,
    &&TARGET_POP_TOP,
    &&TARGET_ROT_TWO,
...

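TARGET_POP_TOP などはどこで定義されているか不明? → おそらく Python/ceval.c の TARGET() マクロが生成するラベル。computed goto が有効な場合は `#define TARGET(op) op: TARGET_##op` と定義されており、`case TARGET(NOP):` は `case NOP: TARGET_NOP:` に展開されると思われる(要確認)。`&&ラベル名` はラベルのアドレスを取るGCC拡張で、`goto *opcode_targets[opcode];` でそのラベルへ直接ジャンプする。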

Discussion