Open2日前にコメント追加3

RemillのIR生成パターン

`push r64`などのISA固有のスタック操作がどのようにモデルされるか

TL;DR: セマンティクスは保存されているが、仮想CPU的コードをLLVM IRに持ち上げている都合上、再コンパイルしたセマンティクスを読み解くには労力が必要。特定のISAしか考慮しない場合はLLVM IRのIntrinsicsを素直に使ってStateなど余計な要素を排除したい。

docker run --rm -it remill --arch amd64 --os windows --ir_out /dev/stdout --bytes 50C3 > pushrax.ll

RemillのRuntimeをビルドするのがしんどいので仮にダミーオブジェクトを作ってコンパイルを通す

dummy.c

/*
 * Link-only placeholders for the Remill runtime helpers referenced by the
 * lifted code. They exist so that the linker can resolve these symbols
 * without building the real Remill runtime.
 *
 * NOTE: the empty `(void)` signatures do not match the declarations in the
 * lifted IR (e.g. __remill_write_memory_64 takes (ptr, i64, i64) and returns
 * ptr there), so these stubs are only suitable for inspecting the compiled
 * output, not for actually executing the lifted function.
 */
__declspec(dllexport) void __remill_write_memory_64(void) {}
__declspec(dllexport) void __remill_read_memory_64(void) {}
__declspec(dllexport) void __remill_missing_block(void) {}
__declspec(dllexport) void __remill_function_return(void) {}

clang -c dummy.c -o dummy.obj

llvm-as pushrax.ll -o pushrax.bc
llc -mtriple=x86_64-pc-windows-msvc -filetype=obj pushrax.bc -o pushrax.obj
lld-link pushrax.obj dummy.obj /noentry /dll

生成されるLLVM IR

pushrax.ll

; ModuleID = 'lifted_code'
source_filename = "lifted_code"
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-windows-msvc-coff"

%struct.State = type { %struct.X86State }
%struct.X86State = type { %struct.ArchState, [32 x %union.VectorReg], %struct.ArithFlags, %union.anon, %struct.Segments, %struct.AddressSpace, %struct.GPR, %struct.X87Stack, %struct.MMX, %struct.FPUStatusFlags, %union.anon, %union.FPU, %struct.SegmentCaches, %struct.K_REG }
%struct.ArchState = type { i32, i32, %union.anon }
%union.VectorReg = type { %union.vec512_t }
%union.vec512_t = type { %struct.uint64v8_t }
%struct.uint64v8_t = type { [8 x i64] }
%struct.ArithFlags = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.Segments = type { i16, %union.SegmentSelector, i16, %union.SegmentSelector, i16, %union.SegmentSelector, i16, %union.SegmentSelector, i16, %union.SegmentSelector, i16, %union.SegmentSelector }
%union.SegmentSelector = type { i16 }
%struct.AddressSpace = type { i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg }
%struct.Reg = type { %union.anon }
%struct.GPR = type { i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg }
%struct.X87Stack = type { [8 x %struct.anon.3] }
%struct.anon.3 = type { [6 x i8], %struct.float80_t }
%struct.float80_t = type { [10 x i8] }
%struct.MMX = type { [8 x %struct.anon.4] }
%struct.anon.4 = type { i64, %union.vec64_t }
%union.vec64_t = type { %struct.uint64v1_t }
%struct.uint64v1_t = type { [1 x i64] }
%struct.FPUStatusFlags = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, [4 x i8] }
%union.anon = type { i64 }
%union.FPU = type { %struct.anon.13 }
%struct.anon.13 = type { %struct.FpuFXSAVE, [96 x i8] }
%struct.FpuFXSAVE = type { %union.SegmentSelector, %union.SegmentSelector, %union.FPUAbridgedTagWord, i8, i16, i32, %union.SegmentSelector, i16, i32, %union.SegmentSelector, i16, %union.FPUControlStatus, %union.FPUControlStatus, [8 x %struct.FPUStackElem], [16 x %union.vec128_t] }
%union.FPUAbridgedTagWord = type { i8 }
%union.FPUControlStatus = type { i32 }
%struct.FPUStackElem = type { %union.anon.11, [6 x i8] }
%union.anon.11 = type { %struct.float80_t }
%union.vec128_t = type { %struct.uint128v1_t }
%struct.uint128v1_t = type { [1 x i128] }
%struct.SegmentCaches = type { %struct.SegmentShadow, %struct.SegmentShadow, %struct.SegmentShadow, %struct.SegmentShadow, %struct.SegmentShadow, %struct.SegmentShadow }
%struct.SegmentShadow = type { %union.anon, i32, i32 }
%struct.K_REG = type { [8 x %struct.anon.18] }
%struct.anon.18 = type { i64, i64 }

define ptr @sub_0(ptr noalias %state, i64 %program_counter, ptr noalias %memory) {
  %RAX = getelementptr inbounds %struct.State, ptr %state, i32 0, i32 0, i32 6, i32 1, i32 0, i32 0
  %BRANCH_TAKEN = alloca i8, align 1
  %RETURN_PC = alloca i64, align 8
  %MONITOR = alloca i64, align 8
  %STATE = alloca ptr, align 8
  store ptr %state, ptr %STATE, align 8
  %MEMORY = alloca ptr, align 8
  store ptr %memory, ptr %MEMORY, align 8
  %NEXT_PC = alloca i64, align 8
  store i64 %program_counter, ptr %NEXT_PC, align 8
  %PC = getelementptr inbounds %struct.State, ptr %state, i32 0, i32 0, i32 6, i32 33, i32 0, i32 0
  %CSBASE = alloca i64, align 8
  store i64 0, ptr %CSBASE, align 8
  %SSBASE = alloca i64, align 8
  store i64 0, ptr %SSBASE, align 8
  %ESBASE = alloca i64, align 8
  store i64 0, ptr %ESBASE, align 8
  %DSBASE = alloca i64, align 8
  store i64 0, ptr %DSBASE, align 8
  store i64 %program_counter, ptr %NEXT_PC, align 8
  br label %1

1:                                                ; preds = %0
  %2 = load i64, ptr %NEXT_PC, align 8
  store i64 %2, ptr %PC, align 8
  %3 = add i64 %2, 1
  store i64 %3, ptr %NEXT_PC, align 8
  %4 = load i64, ptr %RAX, align 8
  %5 = load ptr, ptr %MEMORY, align 8
  %rsp.i.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 13
  %6 = load i64, ptr %rsp.i.i, align 8
  %sub.i.i.i = add i64 %6, -8
  %call.i.i.i = call ptr @__remill_write_memory_64(ptr noundef %5, i64 noundef %sub.i.i.i, i64 noundef %4) #2
  store i64 %sub.i.i.i, ptr %rsp.i.i, align 8
  store ptr %call.i.i.i, ptr %MEMORY, align 8
  br label %7

7:                                                ; preds = %1
  %8 = load i64, ptr %NEXT_PC, align 8
  store i64 %8, ptr %PC, align 8
  %9 = add i64 %8, 1
  store i64 %9, ptr %NEXT_PC, align 8
  %10 = load ptr, ptr %MEMORY, align 8
  %rsp.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 13
  %11 = load i64, ptr %rsp.i, align 8
  %call.i.i = call i64 @__remill_read_memory_64(ptr noundef %10, i64 noundef %11) #2
  %rip.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 33
  store i64 %call.i.i, ptr %rip.i, align 8
  store i64 %call.i.i, ptr %NEXT_PC, align 8
  %12 = load i64, ptr %rsp.i, align 8
  %add.i.i = add i64 %12, 8
  store i64 %add.i.i, ptr %rsp.i, align 8
  store ptr %10, ptr %MEMORY, align 8
  %13 = load i64, ptr %NEXT_PC, align 8
  store i64 %13, ptr %PC, align 8
  %14 = load ptr, ptr %MEMORY, align 8
  %15 = load i64, ptr %PC, align 8
  %16 = tail call ptr @__remill_function_return(ptr %state, i64 %15, ptr %14)
  ret ptr %16
}

; Function Attrs: noduplicate noinline nounwind optnone
declare ptr @__remill_write_memory_64(ptr noundef, i64 noundef, i64 noundef) #0

; Function Attrs: noduplicate noinline nounwind optnone
declare i64 @__remill_read_memory_64(ptr noundef, i64 noundef) #0

; Function Attrs: noduplicate noinline nounwind optnone
declare ptr @__remill_function_return(ptr noundef nonnull align 1, i64 noundef, ptr noundef) #1

attributes #0 = { noduplicate noinline nounwind optnone "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" }
attributes #1 = { noduplicate noinline nounwind optnone "frame-pointer"="all" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" }
attributes #2 = { nobuiltin nounwind "no-builtins" }

エントリブロック。このGEPはstate->gpr->rax->qwordを指す。以降はステートのセットアップ

%RAX = getelementptr inbounds %struct.State, ptr %state, i32 0, i32 0, i32 6, i32 1, i32 0, i32 0

ブロック1は、push raxの操作を忠実に再現

1:                                                ; preds = %0
  %2 = load i64, ptr %NEXT_PC, align 8        ; %2 = pc
  store i64 %2, ptr %PC, align 8              ; rip = %2
  %3 = add i64 %2, 1                          ; %3 = pc + 1 (50 = push rax, 1 byte)
  store i64 %3, ptr %NEXT_PC, align 8         ; next_pc = %3

  %4 = load i64, ptr %RAX, align 8            ; %4 = state->rax
  %5 = load ptr, ptr %MEMORY, align 8         ; %5 = memory

  ; state->gpr->rsp
  %rsp.i.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 13
  %6 = load i64, ptr %rsp.i.i, align 8  ; %6 = rsp
  %sub.i.i.i = add i64 %6, -8           ; new rsp value = rsp - 8 (stored below)

  ; push rax onto the stack: write %4 (rax) at the new stack pointer
  %call.i.i.i = call ptr @__remill_write_memory_64(ptr noundef %5, i64 noundef %sub.i.i.i, i64 noundef %4) #2

  store i64 %sub.i.i.i, ptr %rsp.i.i, align 8 ; update rsp
  store ptr %call.i.i.i, ptr %MEMORY, align 8 ; update the memory pointer with the value returned by the write
  br label %7

ブロック7はretの操作を再現（省略）

7:                                                ; preds = %1
  %8 = load i64, ptr %NEXT_PC, align 8        ; %8 = pc
  store i64 %8, ptr %PC, align 8              ; rip = %8
  %9 = add i64 %8, 1                          ; %9 = pc + 1 (C3 = ret, 1 byte)
  store i64 %9, ptr %NEXT_PC, align 8         ; next_pc = %9

  %10 = load ptr, ptr %MEMORY, align 8        ; %10 = memory

  ; state->gpr->rsp
  %rsp.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 13
  %11 = load i64, ptr %rsp.i, align 8         ; %11 = rsp

  ; read the 64-bit return address from the top of the stack
  %call.i.i = call i64 @__remill_read_memory_64(ptr noundef %10, i64 noundef %11) #2

  ; state->gpr->rip
  %rip.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 33
  store i64 %call.i.i, ptr %rip.i, align 8    ; rip = return address
  store i64 %call.i.i, ptr %NEXT_PC, align 8  ; next_pc = return address

  %12 = load i64, ptr %rsp.i, align 8         ; %12 = rsp
  %add.i.i = add i64 %12, 8                   ; %12 + 8
  store i64 %add.i.i, ptr %rsp.i, align 8     ; rsp += 8 (pop the return address)

  store ptr %10, ptr %MEMORY, align 8         ; memory pointer is stored back unchanged (%10)

これをPEとしてコンパイルしたアセンブリ

sub_180001000 proc near

var_60= qword ptr -60h
var_58= qword ptr -58h
var_50= qword ptr -50h
var_48= qword ptr -48h
var_40= qword ptr -40h
var_38= qword ptr -38h
var_30= qword ptr -30h

push    rsi
push    rdi
sub     rsp, 78h
mov     rsi, rcx
mov     [rsp+88h+var_30], rcx
mov     [rsp+88h+var_58], r8
mov     [rsp+88h+var_60], rdx
mov     [rsp+88h+var_38], 0
mov     [rsp+88h+var_40], 0
mov     [rsp+88h+var_48], 0
mov     [rsp+88h+var_50], 0
mov     [rcx+9A8h], rdx
inc     rdx
mov     [rsp+88h+var_60], rdx
mov     rcx, [rsp+88h+var_58]
mov     r8, [rsi+8A8h]
mov     rdi, [rsi+908h]
add     rdi, 0FFFFFFFFFFFFFFF8h
mov     rdx, rdi
call    __remill_write_memory_64
mov     [rsi+908h], rdi
mov     [rsp+88h+var_58], rax
mov     rax, [rsp+88h+var_60]
mov     [rsi+9A8h], rax
inc     rax
mov     [rsp+88h+var_60], rax
mov     rdi, [rsp+88h+var_58]
mov     rdx, [rsi+908h]
mov     rcx, rdi
call    __remill_read_memory_64
mov     [rsi+9A8h], rax
mov     [rsp+88h+var_60], rax
add     qword ptr [rsi+908h], 8
mov     [rsp+88h+var_58], rdi
mov     rdx, [rsp+88h+var_60]
mov     [rsi+9A8h], rdx
mov     r8, [rsp+88h+var_58]
mov     rcx, rsi
add     rsp, 78h
pop     rdi
pop     rsi
jmp     __remill_function_return
sub_180001000 endp

TL;DR:

__int64 __fastcall sub_180001000(State *state, uint64_t pc, __int64 a3)
{
  uint64_t v4; // rdi
  __int64 v5; // rax
  __int64 v6; // rdi
  uint64_t memory_64; // rax
  uint64_t v9; // [rsp+28h] [rbp-60h]

  state[18].rdx = pc;
  v9 = pc + 1;
  v4 = state[17].rax - 8;
  v5 = _remill_write_memory_64(a3, state[17].rax - 8, state[16].rdi);
  state[17].rax = v4;
  state[18].rdx = v9;
  v6 = v5;
  memory_64 = _remill_read_memory_64(v5, state[17].rax);
  state[18].rdx = memory_64;
  state[17].rax += 8LL;
  state[18].rdx = memory_64;
  return _remill_function_return(state, memory_64, v6);
}

kkent030315

`cpuid`などのレジスタ依存があるISA固有のIntrinsicsがどのようにリフトされるか

これについてはLLVM IRのIntrinsicsでcpuidを保存しないと出力は"Performant LLVM IR"にはならないので少し気になった

docker run --rm -it remill --arch amd64 --os windows --ir_out /dev/stdout --bytes 0FA2 > cpuid.ll

出力が長いので先に結論。やはりcpuidはそのまま保存されていた（正確にはLLVM IRのIntrinsicsではなく、インラインアセンブリ `asm sideeffect` として埋め込まれている）

%38 = call { i64, i64, i64, i64 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},{ax},{bx},{cx},{dx},~{dirflag},~{fpsr},~{flags}"(i64 %31, i64 %33, i64 %35, i64 %37) #4

特に特徴的なのは、ハイパーコール的なインターフェースがあることと、rdmsr, wrmsr, sysenter, syscall, sysretがインラインアセンブリ（`asm sideeffect`）として埋め込まれていること。特権命令が含まれているうえ、そもそもWindowsのsyscallインターフェースとも異なるので、明らかにPerformant LLVM IRではない。さらに巨大なスイッチテーブルについては言うまでもない。

%71 = call i32 asm sideeffect "wrmsr", "={cx},{ax},{dx},~{dirflag},~{fpsr},~{flags}"(i32 %68, i32 %70) #4

出力されたLLVM IR

kkent030315

長すぎてアセンブリを貼り付けるのがしんどいので、コンパイル後の疑似コード

__int64 __fastcall sub_180001000(_QWORD *a1, __int64 a2, __int64 a3)
{
  __int64 v10; // rdx
  __int64 v12; // [rsp+20h] [rbp-118h]
  _QWORD *v13; // [rsp+28h] [rbp-110h]
  __int64 v14; // [rsp+30h] [rbp-108h]
  __int64 v15; // [rsp+48h] [rbp-F0h]
  __int64 v16; // [rsp+60h] [rbp-D8h]
  __int64 v17; // [rsp+68h] [rbp-D0h]
  __int64 v18; // [rsp+70h] [rbp-C8h]
  __int64 v19; // [rsp+78h] [rbp-C0h]
  __int64 v20; // [rsp+80h] [rbp-B8h]
  __int64 v21; // [rsp+88h] [rbp-B0h]
  __int64 v22; // [rsp+90h] [rbp-A8h]
  __int64 v23; // [rsp+98h] [rbp-A0h]
  __int64 v24; // [rsp+A0h] [rbp-98h]
  int v25; // [rsp+ACh] [rbp-8Ch]
  __int64 v26; // [rsp+B0h] [rbp-88h]
  __int64 v27; // [rsp+B8h] [rbp-80h]
  __int64 v28; // [rsp+C0h] [rbp-78h]
  __int64 v29; // [rsp+C8h] [rbp-70h]
  __int64 v30; // [rsp+D0h] [rbp-68h]
  _QWORD *v31; // [rsp+D8h] [rbp-60h]

  v31 = a1;
  v15 = a3;
  v30 = 0;
  v29 = 0;
  v28 = 0;
  v27 = 0;
  a1[309] = a2;
  v26 = a2 + 2;
  v13 = a1;
  v12 = v15;
  v25 = 258;
  v14 = a1[289];
  v24 = a1[291];
  v23 = a1[293];
  v22 = a1[295];
  v21 = a1[297];
  v20 = a1[299];
  v19 = a1[301];
  v18 = a1[303];
  v17 = a1[305];
  v16 = a1[307];
  _RAX = a1[277];
  __asm { cpuid }
  v13[277] = _RAX;
  a1[279] = _RBX;
  a1[281] = _RCX;
  a1[283] = _RDX;
  v15 = v12;
  v10 = v26;
  a1[309] = v26;
  return _remill_missing_block(a1, v10, v15);
}

push r64などのISA固有のスタック操作がどのようにモデルされるか

cpuidなどのレジスタ依存があるISA固有のIntrinsicsがどのようにリフトされるか

`push r64`などのISA固有のスタック操作がどのようにモデルされるか

`cpuid`などのレジスタ依存があるISA固有のIntrinsicsがどのようにリフトされるか