Open3
RemillのIR生成パターン

push r64
などのISA固有のスタック操作がどのようにモデル化されるか
TL;DR: セマンティクスは保存されているが、仮想CPU的コードをLLVM IRに持ち上げている都合上、再コンパイルしたセマンティクスを読み解くには労力が必要。特定のISAしか考慮しない場合はLLVM IRのIntrinsicsを素直に使ってStateなど余計な要素を排除したい。
docker run --rm -it remill --arch amd64 --os windows --ir_out /dev/stdout --bytes 50C3 > pushrax.ll
RemillのRuntimeをビルドするのがしんどいので、仮にダミーオブジェクトを作ってリンクを通す
dummy.c
/*
 * Link-only stand-ins for the Remill runtime symbols referenced by the lifted
 * code, so that lld-link can resolve them without building the real runtime.
 * The bodies are empty and the (void) parameter lists do not match the
 * declarations in the lifted IR; the resulting DLL is meant for inspecting
 * the generated machine code, not for running it.
 */
/* Referenced by the lifted `push rax` block (64-bit memory write). */
__declspec(dllexport) void __remill_write_memory_64(void) {}
/* Referenced by the lifted `ret` block (64-bit memory read). */
__declspec(dllexport) void __remill_read_memory_64(void) {}
/* Referenced at the end of the lifted `cpuid` function. */
__declspec(dllexport) void __remill_missing_block(void) {}
/* Tail-called at the end of the lifted `push rax; ret` function. */
__declspec(dllexport) void __remill_function_return(void) {}
clang -c dummy.c -o dummy.obj
llvm-as pushrax.ll -o pushrax.bc
llc -mtriple=x86_64-pc-windows-msvc -filetype=obj pushrax.bc -o pushrax.obj
lld-link pushrax.obj dummy.obj /noentry /dll
生成されるLLVM IR
pushrax.ll
; ModuleID = 'lifted_code'
source_filename = "lifted_code"
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-windows-msvc-coff"
%struct.State = type { %struct.X86State }
%struct.X86State = type { %struct.ArchState, [32 x %union.VectorReg], %struct.ArithFlags, %union.anon, %struct.Segments, %struct.AddressSpace, %struct.GPR, %struct.X87Stack, %struct.MMX, %struct.FPUStatusFlags, %union.anon, %union.FPU, %struct.SegmentCaches, %struct.K_REG }
%struct.ArchState = type { i32, i32, %union.anon }
%union.VectorReg = type { %union.vec512_t }
%union.vec512_t = type { %struct.uint64v8_t }
%struct.uint64v8_t = type { [8 x i64] }
%struct.ArithFlags = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.Segments = type { i16, %union.SegmentSelector, i16, %union.SegmentSelector, i16, %union.SegmentSelector, i16, %union.SegmentSelector, i16, %union.SegmentSelector, i16, %union.SegmentSelector }
%union.SegmentSelector = type { i16 }
%struct.AddressSpace = type { i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg }
%struct.Reg = type { %union.anon }
%struct.GPR = type { i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg, i64, %struct.Reg }
%struct.X87Stack = type { [8 x %struct.anon.3] }
%struct.anon.3 = type { [6 x i8], %struct.float80_t }
%struct.float80_t = type { [10 x i8] }
%struct.MMX = type { [8 x %struct.anon.4] }
%struct.anon.4 = type { i64, %union.vec64_t }
%union.vec64_t = type { %struct.uint64v1_t }
%struct.uint64v1_t = type { [1 x i64] }
%struct.FPUStatusFlags = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, [4 x i8] }
%union.anon = type { i64 }
%union.FPU = type { %struct.anon.13 }
%struct.anon.13 = type { %struct.FpuFXSAVE, [96 x i8] }
%struct.FpuFXSAVE = type { %union.SegmentSelector, %union.SegmentSelector, %union.FPUAbridgedTagWord, i8, i16, i32, %union.SegmentSelector, i16, i32, %union.SegmentSelector, i16, %union.FPUControlStatus, %union.FPUControlStatus, [8 x %struct.FPUStackElem], [16 x %union.vec128_t] }
%union.FPUAbridgedTagWord = type { i8 }
%union.FPUControlStatus = type { i32 }
%struct.FPUStackElem = type { %union.anon.11, [6 x i8] }
%union.anon.11 = type { %struct.float80_t }
%union.vec128_t = type { %struct.uint128v1_t }
%struct.uint128v1_t = type { [1 x i128] }
%struct.SegmentCaches = type { %struct.SegmentShadow, %struct.SegmentShadow, %struct.SegmentShadow, %struct.SegmentShadow, %struct.SegmentShadow, %struct.SegmentShadow }
%struct.SegmentShadow = type { %union.anon, i32, i32 }
%struct.K_REG = type { [8 x %struct.anon.18] }
%struct.anon.18 = type { i64, i64 }
define ptr @sub_0(ptr noalias %state, i64 %program_counter, ptr noalias %memory) {
%RAX = getelementptr inbounds %struct.State, ptr %state, i32 0, i32 0, i32 6, i32 1, i32 0, i32 0
%BRANCH_TAKEN = alloca i8, align 1
%RETURN_PC = alloca i64, align 8
%MONITOR = alloca i64, align 8
%STATE = alloca ptr, align 8
store ptr %state, ptr %STATE, align 8
%MEMORY = alloca ptr, align 8
store ptr %memory, ptr %MEMORY, align 8
%NEXT_PC = alloca i64, align 8
store i64 %program_counter, ptr %NEXT_PC, align 8
%PC = getelementptr inbounds %struct.State, ptr %state, i32 0, i32 0, i32 6, i32 33, i32 0, i32 0
%CSBASE = alloca i64, align 8
store i64 0, ptr %CSBASE, align 8
%SSBASE = alloca i64, align 8
store i64 0, ptr %SSBASE, align 8
%ESBASE = alloca i64, align 8
store i64 0, ptr %ESBASE, align 8
%DSBASE = alloca i64, align 8
store i64 0, ptr %DSBASE, align 8
store i64 %program_counter, ptr %NEXT_PC, align 8
br label %1
1: ; preds = %0
%2 = load i64, ptr %NEXT_PC, align 8
store i64 %2, ptr %PC, align 8
%3 = add i64 %2, 1
store i64 %3, ptr %NEXT_PC, align 8
%4 = load i64, ptr %RAX, align 8
%5 = load ptr, ptr %MEMORY, align 8
%rsp.i.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 13
%6 = load i64, ptr %rsp.i.i, align 8
%sub.i.i.i = add i64 %6, -8
%call.i.i.i = call ptr @__remill_write_memory_64(ptr noundef %5, i64 noundef %sub.i.i.i, i64 noundef %4) #2
store i64 %sub.i.i.i, ptr %rsp.i.i, align 8
store ptr %call.i.i.i, ptr %MEMORY, align 8
br label %7
7: ; preds = %1
%8 = load i64, ptr %NEXT_PC, align 8
store i64 %8, ptr %PC, align 8
%9 = add i64 %8, 1
store i64 %9, ptr %NEXT_PC, align 8
%10 = load ptr, ptr %MEMORY, align 8
%rsp.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 13
%11 = load i64, ptr %rsp.i, align 8
%call.i.i = call i64 @__remill_read_memory_64(ptr noundef %10, i64 noundef %11) #2
%rip.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 33
store i64 %call.i.i, ptr %rip.i, align 8
store i64 %call.i.i, ptr %NEXT_PC, align 8
%12 = load i64, ptr %rsp.i, align 8
%add.i.i = add i64 %12, 8
store i64 %add.i.i, ptr %rsp.i, align 8
store ptr %10, ptr %MEMORY, align 8
%13 = load i64, ptr %NEXT_PC, align 8
store i64 %13, ptr %PC, align 8
%14 = load ptr, ptr %MEMORY, align 8
%15 = load i64, ptr %PC, align 8
%16 = tail call ptr @__remill_function_return(ptr %state, i64 %15, ptr %14)
ret ptr %16
}
; Function Attrs: noduplicate noinline nounwind optnone
declare ptr @__remill_write_memory_64(ptr noundef, i64 noundef, i64 noundef) #0
; Function Attrs: noduplicate noinline nounwind optnone
declare i64 @__remill_read_memory_64(ptr noundef, i64 noundef) #0
; Function Attrs: noduplicate noinline nounwind optnone
declare ptr @__remill_function_return(ptr noundef nonnull align 1, i64 noundef, ptr noundef) #1
attributes #0 = { noduplicate noinline nounwind optnone "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" }
attributes #1 = { noduplicate noinline nounwind optnone "frame-pointer"="all" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "tune-cpu"="generic" }
attributes #2 = { nobuiltin nounwind "no-builtins" }
エントリブロック、このGEPはstate->gpr->rax->qword
、以下ステートのセットアップ
%RAX = getelementptr inbounds %struct.State, ptr %state, i32 0, i32 0, i32 6, i32 1, i32 0, i32 0
ブロック1はpush rax
の操作を忠実に再現
1: ; preds = %0
%2 = load i64, ptr %NEXT_PC, align 8 ; %2 = next_pc (pc of this instruction)
store i64 %2, ptr %PC, align 8 ; rip = %2
%3 = add i64 %2, 1 ; step over the 1-byte opcode (50 = push rax)
store i64 %3, ptr %NEXT_PC, align 8 ; next_pc = %3
%4 = load i64, ptr %RAX, align 8 ; %4 = state->rax
%5 = load ptr, ptr %MEMORY, align 8 ; %5 = memory
; address of state->gpr->rsp
%rsp.i.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 13
%6 = load i64, ptr %rsp.i.i, align 8 ; %6 = rsp
%sub.i.i.i = add i64 %6, -8 ; new stack top = rsp - 8 (rsp is written back below)
; push rax: write %4 to the new stack top
%call.i.i.i = call ptr @__remill_write_memory_64(ptr noundef %5, i64 noundef %sub.i.i.i, i64 noundef %4) #2
store i64 %sub.i.i.i, ptr %rsp.i.i, align 8 ; commit rsp = rsp - 8
store ptr %call.i.i.i, ptr %MEMORY, align 8 ; memory = pointer returned by the write
br label %7
ブロック7はret
の操作を再現(省略)
7: ; preds = %1
%8 = load i64, ptr %NEXT_PC, align 8
store i64 %8, ptr %PC, align 8
%9 = add i64 %8, 1
store i64 %9, ptr %NEXT_PC, align 8
%10 = load ptr, ptr %MEMORY, align 8
%rsp.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 13
%11 = load i64, ptr %rsp.i, align 8
%call.i.i = call i64 @__remill_read_memory_64(ptr noundef %10, i64 noundef %11) #2
%rip.i = getelementptr inbounds %struct.X86State, ptr %state, i64 0, i32 6, i32 33
store i64 %call.i.i, ptr %rip.i, align 8
store i64 %call.i.i, ptr %NEXT_PC, align 8
%12 = load i64, ptr %rsp.i, align 8
%add.i.i = add i64 %12, 8
store i64 %add.i.i, ptr %rsp.i, align 8
store ptr %10, ptr %MEMORY, align 8
これをPEとしてコンパイルしたアセンブリ
sub_180001000 proc near
var_60= qword ptr -60h
var_58= qword ptr -58h
var_50= qword ptr -50h
var_48= qword ptr -48h
var_40= qword ptr -40h
var_38= qword ptr -38h
var_30= qword ptr -30h
push rsi
push rdi
sub rsp, 78h
mov rsi, rcx
mov [rsp+88h+var_30], rcx
mov [rsp+88h+var_58], r8
mov [rsp+88h+var_60], rdx
mov [rsp+88h+var_38], 0
mov [rsp+88h+var_40], 0
mov [rsp+88h+var_48], 0
mov [rsp+88h+var_50], 0
mov [rcx+9A8h], rdx
inc rdx
mov [rsp+88h+var_60], rdx
mov rcx, [rsp+88h+var_58]
mov r8, [rsi+8A8h]
mov rdi, [rsi+908h]
add rdi, 0FFFFFFFFFFFFFFF8h
mov rdx, rdi
call __remill_write_memory_64
mov [rsi+908h], rdi
mov [rsp+88h+var_58], rax
mov rax, [rsp+88h+var_60]
mov [rsi+9A8h], rax
inc rax
mov [rsp+88h+var_60], rax
mov rdi, [rsp+88h+var_58]
mov rdx, [rsi+908h]
mov rcx, rdi
call __remill_read_memory_64
mov [rsi+9A8h], rax
mov [rsp+88h+var_60], rax
add qword ptr [rsi+908h], 8
mov [rsp+88h+var_58], rdi
mov rdx, [rsp+88h+var_60]
mov [rsi+9A8h], rdx
mov r8, [rsp+88h+var_58]
mov rcx, rsi
add rsp, 78h
pop rdi
pop rsi
jmp __remill_function_return
sub_180001000 endp
TL;DR:
__int64 __fastcall sub_180001000(State *state, uint64_t pc, __int64 a3)
{
uint64_t v4; // rdi
__int64 v5; // rax
__int64 v6; // rdi
uint64_t memory_64; // rax
uint64_t v9; // [rsp+28h] [rbp-60h]
state[18].rdx = pc;
v9 = pc + 1;
v4 = state[17].rax - 8;
v5 = _remill_write_memory_64(a3, state[17].rax - 8, state[16].rdi);
state[17].rax = v4;
state[18].rdx = v9;
v6 = v5;
memory_64 = _remill_read_memory_64(v5, state[17].rax);
state[18].rdx = memory_64;
state[17].rax += 8LL;
state[18].rdx = memory_64;
return _remill_function_return(state, memory_64, v6);
}

cpuid
などのレジスタ依存があるISA固有のIntrinsicsがどのようにリフトされるか
これについてはLLVM IRのIntrinsicsでcpuidを保存しないと出力は"Performant LLVM IR"にはならないので少し気になった
docker run --rm -it remill --arch amd64 --os windows --ir_out /dev/stdout --bytes 0FA2 > cpuid.ll
出力が長いので先に結論、やはりIntrinsics相当のものがLLVM IRのインラインアセンブリ(asm sideeffect)として出力されていた
%38 = call { i64, i64, i64, i64 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},{ax},{bx},{cx},{dx},~{dirflag},~{fpsr},~{flags}"(i64 %31, i64 %33, i64 %35, i64 %37) #4
特に特徴的なのはハイパーコール的なインターフェースがあることと、rdmsr
, wrmsr
, sysenter
, syscall
, sysret
のインラインアセンブリ(asm sideeffect)があること。特権命令が含まれているし、そもそもWindowsのsyscallインターフェースと異なるので、明らかにPerformant LLVM IRではない。さらに巨大なスイッチテーブルについては言うまでもない
%71 = call i32 asm sideeffect "wrmsr", "={cx},{ax},{dx},~{dirflag},~{fpsr},~{flags}"(i32 %68, i32 %70) #4
出力されたLLVM IR

長すぎてアセンブリを貼り付けるのがしんどいので、コンパイル後の擬似コード
__int64 __fastcall sub_180001000(_QWORD *a1, __int64 a2, __int64 a3)
{
__int64 v10; // rdx
__int64 v12; // [rsp+20h] [rbp-118h]
_QWORD *v13; // [rsp+28h] [rbp-110h]
__int64 v14; // [rsp+30h] [rbp-108h]
__int64 v15; // [rsp+48h] [rbp-F0h]
__int64 v16; // [rsp+60h] [rbp-D8h]
__int64 v17; // [rsp+68h] [rbp-D0h]
__int64 v18; // [rsp+70h] [rbp-C8h]
__int64 v19; // [rsp+78h] [rbp-C0h]
__int64 v20; // [rsp+80h] [rbp-B8h]
__int64 v21; // [rsp+88h] [rbp-B0h]
__int64 v22; // [rsp+90h] [rbp-A8h]
__int64 v23; // [rsp+98h] [rbp-A0h]
__int64 v24; // [rsp+A0h] [rbp-98h]
int v25; // [rsp+ACh] [rbp-8Ch]
__int64 v26; // [rsp+B0h] [rbp-88h]
__int64 v27; // [rsp+B8h] [rbp-80h]
__int64 v28; // [rsp+C0h] [rbp-78h]
__int64 v29; // [rsp+C8h] [rbp-70h]
__int64 v30; // [rsp+D0h] [rbp-68h]
_QWORD *v31; // [rsp+D8h] [rbp-60h]
v31 = a1;
v15 = a3;
v30 = 0;
v29 = 0;
v28 = 0;
v27 = 0;
a1[309] = a2;
v26 = a2 + 2;
v13 = a1;
v12 = v15;
v25 = 258;
v14 = a1[289];
v24 = a1[291];
v23 = a1[293];
v22 = a1[295];
v21 = a1[297];
v20 = a1[299];
v19 = a1[301];
v18 = a1[303];
v17 = a1[305];
v16 = a1[307];
_RAX = a1[277];
__asm { cpuid }
v13[277] = _RAX;
a1[279] = _RBX;
a1[281] = _RCX;
a1[283] = _RDX;
v15 = v12;
v10 = v26;
a1[309] = v26;
return _remill_missing_block(a1, v10, v15);
}