hedge-dev · JillianTo · Mar 8, 2025 · Mar 8, 2025 · Mar 9, 2025 · Mar 9, 2025
diff --git a/Auto_Function_Parser.py b/Auto_Function_Parser.py
@@ -0,0 +1,297 @@
+##
+## Searches for functions in .text that are referenced by functions in .pdata
+##
+## Input: 
+## Decompiled code - Created in IDA Pro 9.0SP1 with File -> Produce File -> Create HTML File...
+## CLI output from a XenonRecomp run - When trying to compile with XenonRecomp, use > to save the output from the terminal
+##
+## Output: 
+## XenonRecomp config - Function block for TOML to be inputted into XenonRecomp 
+##
+
+import sys
+import re
+
+# Check if correct number of input arguments were given
+if len(sys.argv) != 4:
+    sys.exit("Auto_Function_Parser.py [IDA HTML] [XenonRecomp log] [Output TOML]")
+
+# Filepath input arguments
+ida_html = sys.argv[1]
+xenonrecomp_log = sys.argv[2]
+output_file = sys.argv[3]
+
+# Disable extra debug output 
+debug = False
+
+##
+## Parse XenonRecomp log
+##
+
+# The starting index of the erroneous switch statement address in the XenonRecomp log
+switch_idx = 22
+
+# Initialize list to store erroneous switch statement addresses
+switch_addrs = []
+
+print("Parsing XenonRecomp log...")
+# Import each line of XenonRecomp log
+with open(xenonrecomp_log, 'r') as file:
+    # Read each line in the file
+    for line in file: 
+        # If this line describes an error, it has the address of a problematic switch statement
+        if re.search('ERROR: Switch case at ', line) != None:
+            # Save the address as integer
+            switch_addrs.append(line[switch_idx:switch_idx+8])
+
+# Save only unique addresses and sort
+switch_addrs = set(switch_addrs)
+
+##
+## Parse IDA HTML
+##
+
+# Initialize list to store start and end of functions 
+functs = []
+
+# Count how many functions have been added
+num_functs = 0
+
+# Function for adding to function list and incrementing count
+def add_function(new_start_addr, prev_end_addr, start_type):
+    global num_functs
+    # If an end address for the last added function was specified
+    if prev_end_addr != None:
+        # Set end address for last added function
+        functs[num_functs-1][1] = prev_end_addr
+    # Add a new function to the list with the specified starting address
+    functs.append([new_start_addr, 0, [], start_type])
+    # Increment the number of functions
+    num_functs = num_functs+1
+
+# Mark if we are in .text section
+in_text = False
+
+# Mark if we should end parsing
+end_parse = False
+
+# Initialize address of last bctr instruction to 0
+bctr_addr = '00000000'
+
+# Initialize address of last blr instruction to 0
+blr_addr = '00000000'
+
+# Initialize address of last 'End of function' comment to 0
+eof_addr = '00000000'
+
+# Initialize address of last restgprlr instruction to 0
+restgprlr_addr = '00000000'
+
+# Initialize address of last padding to 0
+pad_addr = 0
+
+# Import each line of decompiled code
+print("Parsing IDA HTML...")
+with open(ida_html, 'r') as file:
+    # Read each line in the file
+    for line in file:
+        if not end_parse:
+            # If in .text
+            if in_text:
+                # Get the current address
+                colon_idx = line.find(':')
+                curr_addr = line[colon_idx+1:colon_idx+9]
+
+                # Check if this is the start of a function
+                if re.search('.text:'+curr_addr+' </span><span class="c[0-9]*">sub_'+curr_addr+'</span><span class="c[0-9]*">: *</span><span class="c[0-9]*"># [A-Z][A-Z][A-Z][A-Z] XREF:', line):
+                    # Save current address as integer
+                    curr_addr_int = int(curr_addr, 16)
+
+                    # If this is not the first function being added
+                    if num_functs > 0:
+                        # If last address had padding or restgprlr instruction, then this function was already added
+                        if curr_addr_int-4 == pad_addr or curr_addr_int-4 == restgprlr_addr:
+                            # Set function type for start address
+                            functs[num_functs-1][3] = 'sub'
+                        else:
+                            # Check if this function is part of latest added function
+                            is_nested_funct = False
+                            nested_functs = functs[num_functs-1][2]
+                            for nested_funct in nested_functs:
+                                if nested_funct == curr_addr:
+                                    is_nested_funct = True
+                                    break
+
+                            # If last address was not padding and not nested in latest function
+                            if not is_nested_funct:
+                                # Add new function and last function's end address
+                                add_function(curr_addr_int, curr_addr_int, 'sub')
+                    else:
+                        # Add new function
+                        add_function(curr_addr_int, None, 'sub')
+
+                # If this is a location
+                elif re.search('^\.text:'+curr_addr+' </span><span class="c[0-9]*">loc_'+curr_addr, line):
+                    curr_addr_int = int(curr_addr, 16)
+                    curr_funct = functs[num_functs-1]
+                    # If previous address was a blr instruction
+                    if curr_addr_int-4 == int(blr_addr, 16):
+                        # If previous address had an 'End of function' comment or if there was a bctr with the comment
+                        if blr_addr == eof_addr or bctr_addr == eof_addr:
+                            # Find a XREF pointing to a .text address
+                            xref_idx = line.find('XREF: .text:')
+                            if xref_idx > -1:
+                                underscore_idx = line.find('_', xref_idx)
+                                if underscore_idx > -1:
+                                    xref = line[underscore_idx+1:underscore_idx+9]
+                                else:
+                                    xref = line[xref_idx+12:xref_idx+20]
+                            else:
+                                xref = None
+
+                            # Couldn't find XREF pointing to .text address or the XREF is after this address
+                            if xref == None or int(xref, 16) > curr_addr_int:
+                                # Add as new function
+                                add_function(curr_addr_int, curr_addr_int, 'loc')
+
+                    else:
+                        # Find address of function that references this
+                        xref_idx = line.find('CODE XREF: sub_')
+                        # If it was found
+                        if xref_idx > -1:
+                            # Store as nested function in latest function
+                            functs[num_functs-1][2].append(line[xref_idx+15:xref_idx+23])
+
+                # Check if this line is padding
+                elif num_functs > 0 and re.search('<span class="c[0-9]*">\.long </span><span class="c[0-9]*">0$', line):
+                    # Convert current address to integer 
+                    curr_addr_int = int(curr_addr, 16)
+
+                    # Add a new function at the line after padding, and end the current function at this padding address
+                    add_function(curr_addr_int+4, curr_addr_int, None)
+
+                    # Save padding address
+                    pad_addr = curr_addr_int
+
+                # Check for blr instruction
+                elif re.search('<span class="c[0-9]*">blr$', line):
+                    blr_addr = curr_addr
+
+                # Check for 'End of function' comment
+                elif re.search('End of function ', line):
+                    eof_addr = curr_addr
+
+                # Check for bctr instruction
+                elif re.search('<span class="c[0-9]*">bctr$', line):
+                    bctr_addr = curr_addr
+
+                # Check for restgprlr instruction
+                elif re.search('<span class="c[0-9]*">b         </span><span class="c[0-9]*">__restgprlr_[0-9][0-9]$', line):
+                    # Convert current address to integer 
+                    curr_addr_int = int(curr_addr, 16)
+
+                    # Add a new function at the line after restgprlr instruction, and end the current function at this address
+                    add_function(curr_addr_int+4, curr_addr_int, None)
+
+                    restgprlr_addr = curr_addr_int
+
+            # If not in .text
+            else:
+                # If .text section header found
+                if re.search('<span class="c[0-9]*">\.section &quot;\.text&quot;', line) != None:
+                    in_text = True
+
+##
+## Find .text functions that are referenced by .pdata functions
+##
+
+# Initialize list for functions that need to be added to toml
+output_functs = []
+
+# Look for related functions for every unique errored switch statement
+print("Searching for needed functions...")
+for switch_addr in switch_addrs:
+    # Start looking at first subroutine
+    curr_funct_idx = 0
+
+    # Save current switch statement address as integer
+    switch_addr_int = int(switch_addr, 16)
+
+    # The related function for this switch statement has not been found yet
+    search_for_funct = True
+
+    # Start search for function relating to switch statement
+    while(search_for_funct):
+        curr_funct = functs[curr_funct_idx]
+        # If switch address is after this function's start
+        curr_funct_start = curr_funct[0]
+        if(switch_addr_int > curr_funct_start):
+            # If switch address is before this function's end
+            curr_funct_end = curr_funct[1]
+            if(switch_addr_int <= curr_funct_end):
+                # Save current function's start address and the function's length
+                if debug:
+                    output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start), switch_addr])
+                else:
+                    output_functs.append([hex(curr_funct_start), hex(curr_funct_end-curr_funct_start)])
+
+                # Don't need to continue search for this switch statement
+                search_for_funct = False
+
+            # Look in next function
+            curr_funct_idx = curr_funct_idx + 1
+
+        # Related function was not found
+        else:
+            print(f"WARNING: Function relating to {switch_addr} not found! Skipping.")
+            # Don't need to continue search for this switch statement
+            search_for_funct = False
+
+# Remove duplicates
+if not debug: 
+    output_functs = list(set(tuple(funct) for funct in output_functs))
+
+# Make sure there are no functions with the same starting address but different lengths
+if not debug:
+    for i in range(len(output_functs)):
+        for j in range(i+1, len(output_functs)):
+            curr_funct_start = output_functs[i][0]
+            if curr_funct_start == output_functs[j][0]:
+                print(f"WARNING: {curr_funct_start} has multiple entries of different lengths, manually find correct one.")
+
+print(f"{len(output_functs)} functions found!")
+
+##
+## Output all found functions to TOML in correct format
+##
+
+# Create formatted string to export to TOML
+output_str = "functions = ["
+
+# Append all function addresses and lengths to formatted string
+print("Outputting to formatted file...")
+for funct in output_functs:
+    # Format hex to uppercase 
+    curr_funct_start = '0x'+funct[0][2:].upper()
+    curr_funct_end = '0x'+funct[1][2:].upper()
+
+    # Format function 
+    curr_funct = "\n    { address = "+curr_funct_start+", size = "+curr_funct_end
+    if debug:
+        curr_funct = curr_funct+", src = "+funct[2]
+    curr_funct = curr_funct+" },"
+
+    # Add to complete output string
+    output_str = output_str+curr_funct
+
+# Delete last comma
+output_str = output_str[:len(output_str)-1]
+
+# Add last bracket
+output_str = output_str+"\n]"
+
+# Output to file
+with open(output_file, "w") as file:
+    file.write(output_str)
+
+
diff --git a/README.md b/README.md
@@ -188,6 +188,12 @@ functions = [
 
 You can define function boundaries explicitly using the `functions` property if XenonAnalyse fails to analyze them correctly, for example, with functions containing jump tables.
 
+You can automatically generate these using the FunctionParser.py script. You will need to create a HTML of your decompiled XEX with IDA using `File -> Produce File -> Create HTML File...` and save the terminal output from running XenonRecomp by appending `> [output log file path]` to the command.
+
+```
+python3 FunctionParser.py [input IDA HTML file path] [input XenonRecomp log file path] [output function list file path]
+```
+
 #### Invalid Instruction Skips
 
 ```toml