diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f8917e9..213929f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,8 @@ * Fix bug that caused assembling error due to wrong `symbol_minus_symbol` for lsda entries with references to the end of `.gcc_except_table` * Generate alignments for function entry blocks depending on address +* Fixed bug that could result in missed symbolic expressions + (`symbol_minus_symbol`) in LEA # 1.9.0 diff --git a/examples/asm_examples/ex_aligned_data_in_code/ex_original.s b/examples/asm_examples/ex_aligned_data_in_code/ex_original.s index dd242e55..330f7062 100644 --- a/examples/asm_examples/ex_aligned_data_in_code/ex_original.s +++ b/examples/asm_examples/ex_aligned_data_in_code/ex_original.s @@ -27,6 +27,15 @@ main: # Load data into YMM register using vmovups: `data256u` does not need to be aligned. vmovups data256u(%rip), %ymm1 + # Integer arithmetic/logical instructions that require alignment + paddq data128.3(%rip), %xmm0 + pand data128.4(%rip), %xmm0 + psllq data128.5(%rip), %xmm0 + + # Floating-point instructions that require alignment + addps data128.6(%rip), %xmm0 + andpd data128.7(%rip), %xmm0 + call print_message2 xorq %rax, %rax @@ -53,6 +62,21 @@ data128.1: .align 16 data128.2: .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +.align 16 +data128.3: + .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +.align 16 +data128.4: + .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +.align 16 +data128.5: + .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +.align 16 +data128.6: + .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +.align 16 +data128.7: + .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 .align 32 data256: .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 diff --git a/examples/asm_examples/ex_sym_minus_sym/Makefile b/examples/asm_examples/ex_sym_minus_sym/Makefile new file mode 100644 index 00000000..cd3634c9 --- /dev/null +++ b/examples/asm_examples/ex_sym_minus_sym/Makefile @@ -0,0 +1,10 @@ + +all: ex_original.s + gcc ex_original.s -o ex + @./ex > out.txt +clean: + rm -f ex out.txt + rm -fr ex.unstripped ex.s *.old* dl_files *.gtirb +check: + ./ex > /tmp/res.txt + @ diff out.txt /tmp/res.txt && echo TEST OK diff --git a/examples/asm_examples/ex_sym_minus_sym/ex_original.s b/examples/asm_examples/ex_sym_minus_sym/ex_original.s new file mode 100644 index 00000000..9e7906dc --- /dev/null +++ b/examples/asm_examples/ex_sym_minus_sym/ex_original.s @@ -0,0 +1,121 @@ + // This example contains LEAs with symbolic expressions as displacement. + // See the labels `lea_sym_minus_sym1` and `lea_sym_minus_sym2`. + + .text + .intel_syntax noprefix + .globl one # -- Begin function one + .p2align 4, 0x90 + .type one,@function +one: + push rbx + mov ebx, edi + lea rdi, [rip + .L.str] + call puts@PLT + mov eax, ebx + pop rbx + ret + + .globl two # -- Begin function two + .p2align 4, 0x90 + .type two,@function +two: + push rbx + mov ebx, edi + lea rdi, [rip + .L.str.1] + call puts@PLT + mov eax, ebx + pop rbx + ret + + .globl fun # -- Begin function fun + .p2align 4, 0x90 + .type fun,@function +fun: + push rbp + push r14 + push rbx + push r9 + mov ebp, esi + mov ebx, edi + cmp ebx, ebp + jge .L3 + lea r14, [rip + jump_table] +loop_header: + lea eax, [rbx - 1] + cmp eax, 1 + ja .L0 +jumping_block: + movsxd rax, dword ptr [r14 + 4*rax] + add rax, r14 + jmp rax +jt_target_1: + mov edi, ebx + call one + jmp .L2 +.L0: + mov edi, ebx + jmp .L2 +lea_sym_minus_sym1: + lea r9, qword ptr [rax + target2 - lea_sym_minus_sym1] + cmp rbx, rdi + jb .L1 +lea_sym_minus_sym2: + lea r9, qword ptr [r9 + target1 - target2] +.L1: + jmp r9 +target1: + mov edi, ebx + call one + jmp .L2 +target2: + mov edi, ebx + call two +.L2: + add ebx, 1 + cmp ebp, ebx + jne loop_header +.L3: + pop r9 + pop rbx + pop r14 + pop rbp + ret + + .globl main # -- Begin function main + .type main,@function + .p2align 4, 0x90 +main: + push rax + lea rdi, [rip + .L.str.5] + call puts@PLT + mov edi, 1 + mov esi, 6 + call fun + xor eax, eax + pop rcx + ret + + .section .rodata,"a",@progbits + .p2align 2 + +// here we have a table of relative offsets (symbol minus symbol) +jump_table: + .long jt_target_1-jump_table + .long lea_sym_minus_sym1-jump_table + + .section .rodata.str1.1,"aMS",@progbits,1 + + .type .L.str,@object +.L.str: + .asciz "one" + .size .L.str, 4 + + .type .L.str.1,@object +.L.str.1: + .asciz "two" + .size .L.str.1, 4 + + .type .L.str.5,@object +.L.str.5: + .asciz "!!!Hello World!!!" + .size .L.str.5, 18 diff --git a/src/datalog/arch/intel/arch_x86.dl b/src/datalog/arch/intel/arch_x86.dl index c4082033..09cd462d 100644 --- a/src/datalog/arch/intel/arch_x86.dl +++ b/src/datalog/arch/intel/arch_x86.dl @@ -295,6 +295,36 @@ operation_alignment_required("VMOVNTPD"). operation_alignment_required("VMOVNTDQ"). operation_alignment_required("VMOVNTDQA"). +operation_alignment_required(Operation):- + instruction_get_operation(_, Operation), + ( + // Floating-Point/Integer Packed Arithmetic + substr(Operation,0,4) = "ADDP"; + substr(Operation,0,4) = "SUBP"; + substr(Operation,0,4) = "MULP"; + substr(Operation,0,4) = "DIVP"; + + // Integer Packed Arithmetic + substr(Operation,0,4) = "PADD"; + substr(Operation,0,4) = "PSUB"; + substr(Operation,0,4) = "PMUL"; + substr(Operation,0,4) = "PDIV"; + + // Logical Packed Operations + substr(Operation,0,4) = "ANDP"; + substr(Operation,0,3) = "ORP"; + substr(Operation,0,4) = "XORP"; + + substr(Operation,0,4) = "PAND"; + substr(Operation,0,3) = "POR"; + substr(Operation,0,4) = "PXOR"; + + // Packed Shifts and Blends + substr(Operation,0,4) = "PSLL"; + substr(Operation,0,4) = "PSRL"; + substr(Operation,0,4) = "PSRA" + ). + alignment_required(EA,AlignInBits):- instruction_get_operation(EA,Operation), operation_alignment_required(Operation), diff --git a/src/datalog/symbolization.dl b/src/datalog/symbolization.dl index 592e03d7..fd255eb5 100644 --- a/src/datalog/symbolization.dl +++ b/src/datalog/symbolization.dl @@ -388,6 +388,55 @@ symbol_minus_symbol_candidate(EA,Size,Symbol1,Symbol2,as(Scale,unsigned),Offset) // If Symbol1 is zero, we have an absolute address. Symbol1 != 0. +// Displacements in indirect operands with address-value register +// need to be symbolized. +// +// E.g., `__memcpy_ssse3` in libc.so: +// 01:.L_1: +// 02: leaq 40(%r9),%r9 +// 03: cmpq %rcx,%rdx +// 04: movaps -12(%rsi),%xmm1 +// 05: jb .L_aade7 +// 06: leaq -7(%r9),%r9 +// 07:.L_2: +// 08: leaq -64(%rdx),%rdx +// 09: notrack jmpq *%r9 +// 10: ud2 +// 11:.L_3: +// 12: prefetchnta -448(%rsi) +// 13:.L_4: +// 14: movaps -28(%rsi),%xmm2 +// +// .L_1 is one of the targets of a jump-table jump (jmp %9). +// Before line 2, R9's value is the address of itself. +// After line2 (.L_1 + 40 = .L_4), R9's value is .L_4. +// Note that the LEA instructions can be encoded in either 4 bytes or +// 7 bytes depending on assemblers. +// Therefore, it is important to symbolize the displacements. +// +// Similarly, before line 6, R9's value is the address of .L_4. +// After line 6 (.L_4 - 7 = .L3), R9's value is .L_3. +// +// The displacements ini the indirect operands for both the LEAs +// at line 2 and 6 need to be symbolized as +// leaq (.L_4 - .L_1)(%r9), %r9 +// and +// leaq (.L_3 - .L_4)(%r9), %r9 +// respectively. +// +symbol_minus_symbol_candidate(EA+DispOffset,DispSize,Symbol1,Symbol2,1,0):- + binary_isa("X64"), + code(EA), + value_reg_address_before(EA,Reg,Address,_), + !arch.pc_reg(Reg), + !instruction_has_relocation(EA,_), + instruction_get_op(EA,Index,Op), + op_indirect_mapped(Op,"NONE",Reg,"NONE",1,Disp,_), + Symbol1 = as(Address,address), + Symbol2 = as(Address+Disp,address), + instruction_displacement_offset(EA,Index,DispOffset,DispSize). + + //////////////////////////////////////////////////////////////////////// /** diff --git a/src/datalog/value_analysis.dl b/src/datalog/value_analysis.dl index f436eb88..f3cedae5 100644 --- a/src/datalog/value_analysis.dl +++ b/src/datalog/value_analysis.dl @@ -396,3 +396,27 @@ EARegDef: The address where Reg is defined as Value. Often EADef = EARegDef. const_value_reg_used(UsedEA,EADef,EARegDef,Reg,Value):- value_reg(EARegDef,Reg,EADef,"NONE",0,Value,_), reg_def_use.def_used(EARegDef,Reg,UsedEA,_). + +/** +`AddrValue` represents the value of register `Reg` *before* `EA`. +The value should be an address. +*/ +.decl value_reg_address_before(EA:address,Reg:register,AddrValue:number,Steps:unsigned) + +// At the jump-table target from indirect-jump via register, +// the register should have an address value. +value_reg_address_before(EA,Reg,Address,1):- + code(EA), + relative_jump_table_entry(_,TableStart,_,_,EA,_,_), + jump_table_start(EA_jump,_,TableStart,_,_), + reg_jump(EA_jump,Reg), + Address = as(EA,number). + +// Propagate address value +value_reg_address_before(UsedEA,Reg,Address+Offset,Steps+1):- + step_limit(StepLimit), + value_reg_address_before(EADef,RegDef,Address,Steps), + Steps < StepLimit, + reg_def_use.def_used(EADef,Reg,UsedEA,_), + arch.reg_arithmetic_operation(EADef,Reg,RegDef,1,Offset), + track_register(Reg). diff --git a/tests/misc_test.py b/tests/misc_test.py index ed2803be..cf697526 100644 --- a/tests/misc_test.py +++ b/tests/misc_test.py @@ -473,8 +473,8 @@ def test_aligned_data_in_code(self): ) ] - # alignment=16: `data128.1`, `data128.2` - self.assertEqual(alignment_list.count(16), 2) + # alignment=16: `data128.1`~`data128.7` + self.assertEqual(alignment_list.count(16), 7) # alignment=32: `data256` self.assertEqual(alignment_list.count(32), 1) diff --git a/tests/symbolic_operand_heuristics_test.py b/tests/symbolic_operand_heuristics_test.py index 3e981ecf..1823fb71 100644 --- a/tests/symbolic_operand_heuristics_test.py +++ b/tests/symbolic_operand_heuristics_test.py @@ -59,6 +59,41 @@ def test_lea_results(self): ) ) + @unittest.skipUnless( + platform.system() == "Linux", "This test is linux only." + ) + def test_lea_sym_minus_sym(self): + """ + Test cases where the displacement of indirect operand in LEA is the + distance between EAs. + Such displacements should be symbolized as symbol_minus_symbol. + """ + binary = Path("ex") + with cd(ex_asm_dir / "ex_sym_minus_sym"): + self.assertTrue(compile("gcc", "g++", "-O0", [])) + ir_library = disassemble(binary).ir() + m = ir_library.modules[0] + + # check that we symbolize the LEA instructions + symbolized = [ + ("lea_sym_minus_sym1", "target2", "lea_sym_minus_sym1"), + ("lea_sym_minus_sym2", "target1", "target2"), + ] + for name, sym1, sym2 in symbolized: + symbol = next(m.symbols_named(name)) + block = symbol.referent + self.assertIsInstance(block, gtirb.CodeBlock) + _, _, sym_expr = next( + block.byte_interval.symbolic_expressions_at( + range(block.address, block.address + block.size) + ) + ) + self.assertIsInstance(sym_expr, gtirb.SymAddrAddr) + self.assertEqual(sym_expr.scale, 1) + self.assertEqual(sym_expr.offset, 0) + self.assertEqual(sym_expr.symbol1.name, sym1) + self.assertEqual(sym_expr.symbol2.name, sym2) + if __name__ == "__main__": unittest.main()