From eaa84dcda0e80414a5119779712049be2da6864c Mon Sep 17 00:00:00 2001 From: Minhyuk Kim Date: Mon, 13 Jan 2025 15:36:49 +0900 Subject: [PATCH] Add rv64-matching-tool to tests --- tests/rv64-matching-tool/README.md | 78 +++++ tests/rv64-matching-tool/matching_tool.py | 213 +++++++++++++ tests/rv64-matching-tool/requirements.txt | 1 + .../supported_targets/README.md | 17 ++ .../supported_targets/asterisc-v1.1.2.json | 282 ++++++++++++++++++ 5 files changed, 591 insertions(+) create mode 100644 tests/rv64-matching-tool/README.md create mode 100644 tests/rv64-matching-tool/matching_tool.py create mode 100644 tests/rv64-matching-tool/requirements.txt create mode 100644 tests/rv64-matching-tool/supported_targets/README.md create mode 100644 tests/rv64-matching-tool/supported_targets/asterisc-v1.1.2.json diff --git a/tests/rv64-matching-tool/README.md b/tests/rv64-matching-tool/README.md new file mode 100644 index 00000000..7546070c --- /dev/null +++ b/tests/rv64-matching-tool/README.md @@ -0,0 +1,78 @@ +# RISCV64 matching tool + +This project aims to analyze a RISCV64 ELF binary to ensure that all its RISCV64 executable instructions +are supported by a virtual machine implementation. + +The tool takes as input: +- a RISCV64 binary: the ELF file to analyze +- a JSON file: the definition of the opcodes in the virtual machine implementation + +The tool parses the opcode of an instruction and then some of the subfields (such as `funct3`) based on the given JSON file. + +When an instruction is found in the binary but is not in the JSON file, the number of `UNKNOWN` instructions is incremented +and the instruction is collected. + +Moreover, the tool now supports detecting Linux syscalls. + +Finally, the tool prints out the number of `UNKNOWN` instructions and the number of occurrences for each of them. + + +## Limits + +### Instructions parsing + +This tool is an instruction parser. 
As RISCV64 instructions can be very similar to one another, +the tool may not be able to detect small discrepancies. + +For example, based on the `supported_targets/asterisc-v1.1.2.json`, the tool is not yet able to differentiate +`FLW` and `FLD` instructions. + +### Machine definition + +The machine definition is given in the JSON file. It details all the instructions that the machine is able to support. + +The unsupported instructions must not be included in the machine definition. + +Note that this definition requires detailing instruction fields such as opcode, funct3, ... + +### ELF section + +The tool only parses the `.text` section of the ELF RISCV64 binary. Potential executable instructions in other sections are not parsed. + +## Syscalls detection + +The tool supports detecting some syscalls. When an `ECALL` instruction is detected, the tool scans the **5 previous instructions** to find if the value of the `a7` register was set to an immediate. +The `a7` register holds the syscall identifier. + +Why **5 previous instructions**? This is an arbitrary value, determined by reverse-engineering a RISCV-64 binary. + +If no value for the `a7` register is found or if the `a7` value is not recognized as supported, the tool will trigger an alert. 
"""Check that every instruction of a RISC-V 64 ELF binary is supported by a VM.

The ``.text`` section of the ELF file is read as a stream of 32-bit
little-endian instruction words.  Each word is classified with a JSON machine
definition (opcode first, then the ``funct3`` / ``funct7`` / ``funct12``
sub-fields).  Words that match no entry are counted as ``UNKNOWN``.  ``ECALL``
instructions are additionally resolved to a syscall name by looking for an
immediate loaded into ``a7`` in the few instructions that precede them.

Install
-------
Clone the repository::

    git clone https://github.com/zigtur/rv64-matching-tool
    cd rv64-matching-tool

Install the local environment and its dependencies (``requirements.txt``
lists a single package, ``pyelftools``)::

    python3 -m venv localenv
    source localenv/bin/activate
    pip3 install -r requirements.txt

Usage
-----
Execute the script to analyze the RV64 binary::

    python3 matching_tool.py ./path_to_binary ./supported_targets/your_VM_target.json
"""
import sys
import re
import json

try:
    from elftools.elf.elffile import ELFFile
except ImportError:
    # Keep the decoding helpers importable without pyelftools; only
    # extract_text_section_instructions() actually needs it.
    ELFFile = None

# Number of instructions inspected before an ECALL when looking for a7
SYSCALL_INSTRUCTIONS_LOOKBACK = 5

_U32_MAX = 0xFFFFFFFF
_OPCODE_OP_IMM = 0x13  # ADDI, SLLI, XORI, ... share this opcode
_OPCODE_LUI = 0x37
_REG_A7 = 17  # a7 = x17


def extract_text_section_instructions(elf_path):
    """
    Extract the instruction words of the .text section of a RISC-V ELF binary.

    The section is cut into 4-byte little-endian words; trailing bytes that do
    not fill a whole word are ignored.  Every word is treated as one 32-bit
    instruction (no special handling of shorter encodings is done here).

    Args:
    - elf_path (str): Path to the ELF binary file.

    Returns:
    - List of int, one per 32-bit word of the .text section.

    Exits the process with status 1 (after printing an error) when pyelftools
    is missing, the file cannot be read, the ELF is not RISC-V, or it has no
    .text section.
    """
    if ELFFile is None:
        print("Error: pyelftools is not installed (pip3 install -r requirements.txt).")
        sys.exit(1)

    try:
        with open(elf_path, 'rb') as f:
            elffile = ELFFile(f)

            # Check if the ELF is for RISC-V architecture (EM_RISCV = 243)
            if elffile['e_machine'] != 'EM_RISCV':
                print(f"Error: ELF is not for RISC-V (detected: {elffile['e_machine']})")
                sys.exit(1)

            text_section = elffile.get_section_by_name('.text')
            if text_section is None:
                print(f"Error: Could not find the .text section in {elf_path}")
                sys.exit(1)

            # Read the raw bytes while the file is still open
            text_data = text_section.data()

    except FileNotFoundError:
        print(f"Error: File '{elf_path}' not found.")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary of a CLI tool: report any parsing failure and stop.
        # (sys.exit above raises SystemExit, which is not caught here.)
        print(f"Error: Unable to read the ELF file. Reason: {e}")
        sys.exit(1)

    whole_words_end = len(text_data) - len(text_data) % 4
    return [
        int.from_bytes(text_data[i:i + 4], byteorder='little')
        for i in range(0, whole_words_end, 4)
    ]


def parse_rd(instr):
    """Return bits 11:7 of *instr* (destination register field)."""
    return (instr >> 7) & 0x1F


def parse_imm_i(instr):
    """Return bits 31:20 of *instr* as an unsigned 12-bit value (I-type immediate)."""
    return (instr >> 20) & 0xFFF


def parse_imm_u(instr):
    """Return bits 31:12 of *instr* left in place (U-type immediate, low 12 bits zero)."""
    return instr & 0xFFFFF000


def parse_rs1(instr):
    """Return bits 19:15 of *instr* (first source register field)."""
    return (instr >> 15) & 0x1F


def parse_funct3(instr):
    """Return bits 14:12 of *instr*."""
    return (instr >> 12) & 0x7


def parse_funct7(instr):
    """Return bits 31:25 of *instr*."""
    return (instr >> 25) & 0x7F


def parse_funct12(instr):
    """Return bits 31:20 of *instr*."""
    return (instr >> 20) & 0xFFF


def parse_opcode(instr):
    """Return bits 6:0 of *instr*."""
    return instr & 0x7F


def _matches(entries, key):
    """Return the values stored under *key* in a list of dicts, in list order."""
    return [entry[key] for entry in entries if key in entry]


def instruction_name(instruction, supported):
    """
    Return the name the machine definition gives to *instruction*.

    Args:
    - instruction (int): 32-bit instruction word.
    - supported (dict): machine definition; ``supported['opcodes']`` is a list
      of dicts keyed by two-digit uppercase hex opcode.  A value is either a
      name (str) or a dict holding a ``funct3`` and/or ``funct7`` list.

    Returns:
    - The matching name, or "UNKNOWN" when nothing matches.
    """
    opcode_hex = f"{parse_opcode(instruction):02X}"
    funct3_hex = f"{parse_funct3(instruction):02X}"
    funct7_hex = f"{parse_funct7(instruction):02X}"
    funct12_hex = f"{parse_funct12(instruction):04X}"

    for opcode_data in _matches(supported['opcodes'], opcode_hex):
        # Direct instruction like LUI, JAL, etc.
        if isinstance(opcode_data, str):
            return opcode_data

        # funct3-based instructions
        for funct3_data in _matches(opcode_data.get('funct3', []), funct3_hex):
            if isinstance(funct3_data, dict):
                # funct12 sub-table (ECALL, EBREAK, ...).  A miss must not leak
                # the dict to the caller: it is simply not a supported name.
                names = _matches(funct3_data.get('funct12', []), funct12_hex)
                if names:
                    return names[0]
                continue
            return funct3_data

        # funct7-based instructions: an exact funct7 entry always wins over
        # 'default', whatever their order in the JSON list.
        funct7_entries = opcode_data.get('funct7', [])
        for funct7_key in (funct7_hex, 'default'):
            for funct7_data in _matches(funct7_entries, funct7_key):
                names = _matches(funct7_data.get('funct3', []), funct3_hex)
                if names:
                    return names[0]

    return "UNKNOWN"


def parse_instructions(instructions, json_path):
    """
    Classify every instruction word and count the results.

    Args:
    - instructions (list of int): 32-bit instruction words.
    - json_path (str): path of the JSON machine definition.

    Returns a 3-tuple of dicts:
    - name -> number of occurrences (ECALLs appear under their resolved
      syscall name),
    - unknown instruction word -> number of occurrences,
    - unknown syscall description -> number of occurrences.

    Exits the process with status 1 if a value does not fit in 32 bits.
    """
    last_bytes = {}
    unknown_syscalls = {}
    unknown_instructions = {}
    supported, syscall_map = dict_from_json(json_path)

    for index, instruction in enumerate(instructions):
        # 0xFFFFFFFF is a legal 32-bit word, hence the inclusive bound
        if not 0 <= instruction <= _U32_MAX:
            print(f"Error: Unexpected instruction: {instruction}.")
            sys.exit(1)

        ins_name = instruction_name(instruction, supported)
        if ins_name == "ECALL":
            ins_name = parse_syscall(instructions, index, syscall_map)
            if "UNKNOWN" in ins_name:
                unknown_syscalls[ins_name] = unknown_syscalls.get(ins_name, 0) + 1
        if ins_name == "UNKNOWN":
            unknown_instructions[instruction] = unknown_instructions.get(instruction, 0) + 1
        last_bytes[ins_name] = last_bytes.get(ins_name, 0) + 1

    return last_bytes, unknown_instructions, unknown_syscalls


def find_a7_value(instructions, index):
    """
    Find the immediate loaded into a7 just before ``instructions[index]``.

    The SYSCALL_INSTRUCTIONS_LOOKBACK instructions preceding *index* are
    scanned from the nearest one backwards, so the most recent write wins.
    Only two forms are recognised:
    - ``addi a7, x0, imm`` (``li``): returns ``imm`` (unsigned 12-bit field),
    - ``lui a7, imm``: returns the value written, i.e. ``imm << 12``.

    Any other OP-IMM instruction writing a7 makes the value depend on a
    register, so None is returned.  Instructions with other opcodes are
    skipped.

    Returns:
    - int value, or None when no usable write to a7 was found.
    """
    start = max(0, index - SYSCALL_INSTRUCTIONS_LOOKBACK)
    for instr in reversed(instructions[start:index]):
        if parse_rd(instr) != _REG_A7:
            continue
        opcode = parse_opcode(instr)
        if opcode == _OPCODE_LUI:
            return parse_imm_u(instr)
        if opcode == _OPCODE_OP_IMM:
            if parse_funct3(instr) == 0 and parse_rs1(instr) == 0:
                return parse_imm_i(instr)
            # a7 was overwritten from a register: value not known statically
            return None
    return None


def parse_syscall(instructions, index, syscall_map):
    """
    Name the syscall performed by the ECALL at ``instructions[index]``.

    Args:
    - instructions (list of int): 32-bit instruction words.
    - index (int): position of the ECALL.
    - syscall_map (dict): uppercase hex a7 value (at least two digits) -> name.

    Returns:
    - "ECALL.<name>" when a7 is found and listed in *syscall_map*,
    - "UNKNOWN_SYSCALL (a7 = 0x..)" when a7 is found but not listed,
    - "UNKNOWN_SYSCALL (a7 = UNKNOWN)" when a7 could not be determined.
    """
    a7 = find_a7_value(instructions, index)
    if a7 is None:
        return "UNKNOWN_SYSCALL (a7 = UNKNOWN)"
    syscall_name = syscall_map.get(f"{a7:02X}")
    if syscall_name is None:
        return f"UNKNOWN_SYSCALL (a7 = 0x{a7:X})"
    return f"ECALL.{syscall_name}"


def dict_from_json(json_path):
    """
    Load the machine definition.

    Args:
    - json_path (str): path of the JSON file.

    Returns:
    - (data, syscalls): the parsed JSON document, and the entries of its
      optional "syscalls" list merged into a single dict.

    Exits the process with status 1 (after printing an error) when the file
    cannot be read, is not valid JSON, or does not have the expected shape.
    """
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
        syscalls = {
            key: name
            for entry in data.get('syscalls', [])
            for key, name in entry.items()
        }
        return data, syscalls
    except (OSError, ValueError, AttributeError, TypeError) as e:
        print(f"Error: Unable to read the JSON file. Reason: {e}")
        sys.exit(1)


def main(argv=None):
    """
    Command-line entry point.

    Args:
    - argv (list of str or None): argument vector including the program name;
      defaults to ``sys.argv``.

    Prints the report and exits with status 1 on a usage error or when unknown
    instructions were found.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 3:
        print("Usage: python3 matching_tool.py <elf_path> <json_path>")
        sys.exit(1)

    elf_path = args[1]
    json_path = args[2]
    instructions = extract_text_section_instructions(elf_path)

    instruction_counts, unknown_instr, unknown_syscalls = parse_instructions(instructions, json_path)

    # SYSCALL results
    for key, count in unknown_syscalls.items():
        print(f"There were {count} {key}.")

    nb_unknown = instruction_counts.get("UNKNOWN", 0)
    if nb_unknown != 0:
        print(f"There were {nb_unknown} unknown instructions.\n")
        for instru, count in sorted(unknown_instr.items()):
            print(f"Unknown instruction: {instru:08X}: {count} times")
        sys.exit(1)
    else:
        print("All instructions known.")


if __name__ == "__main__":
    main()
b/tests/rv64-matching-tool/supported_targets/README.md @@ -0,0 +1,17 @@ +# Targets + + + +## Asterisc + +### v1.1.2 + +Despite having a handler for the following instructions, the Asterisc v1.1.2 target implements them as NO-OP. They can't be considered as supported. +- "07": "FLW/FLD" +- "27": "FSW/FSD" +- "53": "FADD" + + +Multiple syscalls are implemented. The Asterisc JSON file defines only 3 of them: read, write and exit. + + diff --git a/tests/rv64-matching-tool/supported_targets/asterisc-v1.1.2.json b/tests/rv64-matching-tool/supported_targets/asterisc-v1.1.2.json new file mode 100644 index 00000000..d0b51f19 --- /dev/null +++ b/tests/rv64-matching-tool/supported_targets/asterisc-v1.1.2.json @@ -0,0 +1,282 @@ +{ + "opcodes": [ + { + "03": { + "funct3": [ + { + "00": "LB" + }, + { + "01": "LH" + }, + { + "02": "LW" + }, + { + "03": "LD" + }, + { + "04": "LBU" + }, + { + "05": "LHU" + }, + { + "06": "LWU" + } + ] + } + }, + { + "23": { + "funct3": [ + { + "00": "SB" + }, + { + "01": "SH" + }, + { + "02": "SW" + }, + { + "03": "SD" + } + ] + } + }, + { + "63": { + "funct3": [ + { + "00": "BEQ" + }, + { + "01": "BNE" + }, + { + "04": "BLT" + }, + { + "05": "BGE" + }, + { + "06": "BLTU" + }, + { + "07": "BGEU" + } + ] + } + }, + { + "13": { + "funct3": [ + { + "00": "ADDI" + }, + { + "01": "SLLI" + }, + { + "02": "SLTI" + }, + { + "03": "SLTIU" + }, + { + "04": "XORI" + }, + { + "05": "SRLI/SRAI" + }, + { + "06": "ORI" + }, + { + "07": "ANDI" + } + ] + } + }, + { + "1B": { + "funct3": [ + { + "00": "ADDIW" + }, + { + "01": "SLLIW" + }, + { + "05": "SRLIW/SRAIW" + } + ] + } + }, + { + "33": { + "funct7": [ + { + "default": { + "funct3": [ + { + "00": "ADD/SUB" + }, + { + "01": "SLL" + }, + { + "02": "SLT" + }, + { + "03": "SLTU" + }, + { + "04": "XOR" + }, + { + "05": "SRL/SRA" + }, + { + "06": "OR" + }, + { + "07": "AND" + } + ] + } + }, + { + "01": { + "funct3": [ + { + "00": "MUL" + }, + { + "01": "MULH" + }, + { + "02": "MULHSU" + }, + { + "03": "MULHU" + }, + 
{ + "04": "DIV" + }, + { + "05": "DIVU" + }, + { + "06": "REM" + }, + { + "07": "REMU" + } + ] + } + } + ] + } + }, + { + "3B": { + "funct7": [ + { + "default": { + "funct3": [ + { + "00": "ADDW/SUBW" + }, + { + "01": "SLLW" + }, + { + "02": "SLT" + }, + { + "05": "SRLW/SRAW" + }, + { + "06": "OR" + }, + { + "07": "AND" + } + ] + } + }, + { + "01": { + "funct3": [ + { + "00": "MULW" + }, + { + "04": "DIVW" + }, + { + "05": "DIVUW" + }, + { + "06": "REMW" + }, + { + "07": "REMUW" + } + ] + } + } + ] + } + }, + { + "37": "LUI" + }, + { + "17": "AUIPC" + }, + { + "6F": "JAL" + }, + { + "67": "JALR" + }, + { + "73": { + "funct3": [ + { + "00": { + "funct12": [ + { + "0000": "ECALL" + }, + { + "0001": "EBREAK" + } + ] + } + } + ] + } + }, + { + "2F": "RV32A" + }, + { + "0F": "FENCE" + } + ], + "syscalls": [ + { + "3F": "READ" + }, + { + "40": "WRITE" + }, + { + "5D": "EXIT" + } + ] +} \ No newline at end of file