Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created script for automatically generating function boundaries #98

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
297 changes: 297 additions & 0 deletions Auto_Function_Parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
##
## Searches for functions in .text that are referenced by functions in .pdata
##
## Input:
## Decompiled code - Created in IDA Pro 9.0SP1 with File -> Produce File -> Create HTML File...
## CLI output from a XenonRecomp run - When trying to compile with XenonRecomp, use > to save the output from the terminal
##
## Output:
## XenonRecomp config - Function block for TOML to be inputted into XenonRecomp
##

import sys
import re

# The script takes exactly three positional arguments; unpacking them directly
# fails with ValueError for any other count, which is reported as a usage error
try:
    ida_html, xenonrecomp_log, output_file = sys.argv[1:]
except ValueError:
    sys.exit("Auto_Function_Parser.py [IDA HTML] [XenonRecomp log] [Output TOML]")

# Set to True to also record, for every emitted function, the switch address
# that caused it to be emitted
debug = False

##
## Parse XenonRecomp log
##

# Text that immediately precedes the address in a XenonRecomp switch error
switch_marker = 'ERROR: Switch case at '

# Offset of the address relative to the start of the marker
switch_idx = len(switch_marker)

# Initialize list to store erroneous switch statement addresses
switch_addrs = []

print("Parsing XenonRecomp log...")
# Import each line of XenonRecomp log
with open(xenonrecomp_log, 'r') as file:
    for line in file:
        # Locate the marker wherever it sits in the line, so that any prefix
        # in front of it does not shift the address slice
        marker_pos = line.find(switch_marker)
        if marker_pos == -1:
            continue

        # The 8 characters after the marker are the address (kept as a hex string)
        addr_start = marker_pos + switch_idx
        switch_addrs.append(line[addr_start:addr_start+8])

# Keep only unique addresses, sorted so that later processing and the
# generated output are reproducible from run to run
switch_addrs = sorted(set(switch_addrs))

##
## Parse IDA HTML
##

# Every discovered function is stored as one mutable record:
#   [start address, end address, nested function addresses, start type]
functs = []

# Running total of records appended to functs
num_functs = 0


def add_function(new_start_addr, prev_end_addr, start_type):
    """Append a new function record, optionally closing the previous one.

    new_start_addr -- start address stored in the new record
    prev_end_addr  -- end address to write into the most recently added
                      record, or None to leave that record untouched
    start_type     -- tag stored in the new record (callers in this file pass
                      'sub', 'loc' or None)

    The new record starts with an end address of 0 and an empty nested list.
    """
    global num_functs

    # Close the previous record first, while it is still the last one
    if prev_end_addr is not None:
        last_funct = functs[num_functs - 1]
        last_funct[1] = prev_end_addr

    functs.append([new_start_addr, 0, [], start_type])
    num_functs += 1

# NOTE(review): this listing has lost its leading whitespace, so the nesting of
# the statements below (in particular which `if` each `else` pairs with) cannot
# be read from the text alone - confirm against the original file before editing.
#
# Purpose: scan the IDA HTML line by line and, once inside .text, build the
# function list through add_function() from sub_/loc_ labels, zero padding
# words, blr/bctr instructions, 'End of function' comments and
# `b __restgprlr_NN` branches.

# Mark if we are in .text section (set once the section header line is seen)
in_text = False

# Mark if we should end parsing
# NOTE(review): nothing in the visible code ever sets this to True
end_parse = False

# Address of last bctr instruction, kept as the 8-character hex string sliced
# from the line
bctr_addr = '00000000'

# Address of last blr instruction, kept as an 8-character hex string
blr_addr = '00000000'

# Address of last 'End of function' comment, kept as an 8-character hex string
eof_addr = '00000000'

# Address of last restgprlr instruction
# NOTE(review): starts as a string but is overwritten with an int when such a
# branch is seen; it is compared against an int below, so that comparison can
# only succeed after the first overwrite
restgprlr_addr = '00000000'

# Address of last padding, kept as an int
pad_addr = 0

# Import each line of decompiled code
print("Parsing IDA HTML...")
with open(ida_html, 'r') as file:
# Read each line in the file
for line in file:
if not end_parse:
# If in .text
if in_text:
# Get the current address: the 8 characters following the first ':' on the line
colon_idx = line.find(':')
curr_addr = line[colon_idx+1:colon_idx+9]

# Check if this is the start of a function: a sub_<addr> label at its own
# address that carries an XREF comment with a four-capital-letter kind
if re.search('.text:'+curr_addr+' </span><span class="c[0-9]*">sub_'+curr_addr+'</span><span class="c[0-9]*">: *</span><span class="c[0-9]*"># [A-Z][A-Z][A-Z][A-Z] XREF:', line):
# Save current address as integer
curr_addr_int = int(curr_addr, 16)

# If this is not the first function being added
if num_functs > 0:
# If last address had padding or restgprlr instruction, then this function was already added
# (those branches call add_function with this address and a start type of None)
if curr_addr_int-4 == pad_addr or curr_addr_int-4 == restgprlr_addr:
# Set function type for start address
functs[num_functs-1][3] = 'sub'
else:
# Check if this function is part of latest added function, i.e. its address
# was recorded in that function's nested list by the loc_ branch below
is_nested_funct = False
nested_functs = functs[num_functs-1][2]
for nested_funct in nested_functs:
if nested_funct == curr_addr:
is_nested_funct = True
break

# If last address was not padding and not nested in latest function
if not is_nested_funct:
# Add new function and last function's end address
add_function(curr_addr_int, curr_addr_int, 'sub')
else:
# Add new function (no previous end address is passed)
add_function(curr_addr_int, None, 'sub')

# If this is a location (a loc_<addr> label at its own address)
# NOTE(review): this pattern and two below contain '\.' in non-raw string
# literals; newer Python versions warn about such escapes - consider r'...'
elif re.search('^\.text:'+curr_addr+' </span><span class="c[0-9]*">loc_'+curr_addr, line):
curr_addr_int = int(curr_addr, 16)
# NOTE(review): curr_funct is not read anywhere in this branch
curr_funct = functs[num_functs-1]
# If previous address was a blr instruction
if curr_addr_int-4 == int(blr_addr, 16):
# If previous address had an 'End of function' comment or if there was a bctr with the comment
if blr_addr == eof_addr or bctr_addr == eof_addr:
# Find a XREF pointing to a .text address; take the 8 characters after the
# first '_' that follows it, or failing that the 8 characters after 'XREF: .text:'
xref_idx = line.find('XREF: .text:')
if xref_idx > -1:
underscore_idx = line.find('_', xref_idx)
if underscore_idx > -1:
xref = line[underscore_idx+1:underscore_idx+9]
else:
xref = line[xref_idx+12:xref_idx+20]
else:
xref = None

# Couldn't find XREF pointing to .text address or the XREF is after this address
if xref == None or int(xref, 16) > curr_addr_int:
# Add as new function
add_function(curr_addr_int, curr_addr_int, 'loc')

# NOTE(review): with indentation lost it is unclear which of the three `if`
# statements above this `else` pairs with - confirm against the original
else:
# Find address of function that references this
xref_idx = line.find('CODE XREF: sub_')
# If it was found
if xref_idx > -1:
# Store as nested function in latest function (the 8 characters after 'sub_')
functs[num_functs-1][2].append(line[xref_idx+15:xref_idx+23])

# Check if this line is padding (a `.long 0` at the very end of the line)
elif num_functs > 0 and re.search('<span class="c[0-9]*">\.long </span><span class="c[0-9]*">0$', line):
# Convert current address to integer
curr_addr_int = int(curr_addr, 16)

# Add a new function at the line after padding, and end the current function at this padding address
add_function(curr_addr_int+4, curr_addr_int, None)

# Save padding address
pad_addr = curr_addr_int

# Check for blr instruction
elif re.search('<span class="c[0-9]*">blr$', line):
blr_addr = curr_addr

# Check for 'End of function' comment
elif re.search('End of function ', line):
eof_addr = curr_addr

# Check for bctr instruction
elif re.search('<span class="c[0-9]*">bctr$', line):
bctr_addr = curr_addr

# Check for restgprlr instruction (a `b __restgprlr_NN` ending the line)
elif re.search('<span class="c[0-9]*">b </span><span class="c[0-9]*">__restgprlr_[0-9][0-9]$', line):
# Convert current address to integer
curr_addr_int = int(curr_addr, 16)

# Add a new function at the line after restgprlr instruction, and end the current function at this address
add_function(curr_addr_int+4, curr_addr_int, None)

# Save restgprlr address (as an int from here on)
restgprlr_addr = curr_addr_int

# If not in .text
else:
# If .text section header found, start parsing from the next line onwards
if re.search('<span class="c[0-9]*">\.section &quot;\.text&quot;', line) != None:
in_text = True

##
## Find .text functions that are referenced by .pdata functions
##

# Functions that need to be added to the TOML. Each entry is
# [start address, length] as hex strings, plus the originating switch address
# when debug is enabled.
output_functs = []

# Look for the containing function of every unique errored switch statement
print("Searching for needed functions...")
for switch_addr in switch_addrs:
    # Save current switch statement address as integer
    switch_addr_int = int(switch_addr, 16)

    # Walk the function list in order. Iterating the list directly (instead of
    # indexing with a counter) means an address lying beyond every recorded
    # function ends the search cleanly rather than raising IndexError.
    containing_funct = None
    for candidate in functs:
        # Assumes functs is ordered by ascending start address (it is built in
        # listing order): once a function starts at or after the switch
        # address, no later function can contain it
        if switch_addr_int <= candidate[0]:
            break
        # Switch address is after the start; check it is not past the end
        if switch_addr_int <= candidate[1]:
            containing_funct = candidate
            break

    # Related function was not found
    if containing_funct is None:
        print(f"WARNING: Function relating to {switch_addr} not found! Skipping.")
        continue

    # Save the function's start address and length
    curr_funct_start = containing_funct[0]
    curr_funct_end = containing_funct[1]
    found_funct = [hex(curr_funct_start), hex(curr_funct_end-curr_funct_start)]
    if debug:
        found_funct.append(switch_addr)
    output_functs.append(found_funct)

if not debug:
    # Remove duplicates, then order by address (and length) so the generated
    # file is identical from run to run
    output_functs = sorted(
        set(tuple(funct) for funct in output_functs),
        key=lambda funct: (int(funct[0], 16), int(funct[1], 16)),
    )

    # Make sure there are no functions with the same starting address but
    # different lengths; warn once per affected address
    start_counts = {}
    for funct in output_functs:
        start_counts[funct[0]] = start_counts.get(funct[0], 0) + 1
    for curr_funct_start, count in start_counts.items():
        if count > 1:
            print(f"WARNING: {curr_funct_start} has multiple entries of different lengths, manually find correct one.")

print(f"{len(output_functs)} functions found!")

##
## Output all found functions to TOML in correct format
##

# Build one TOML inline table per function
print("Outputting to formatted file...")
formatted_functs = []
for funct in output_functs:
    # Format hex to uppercase, keeping the lowercase '0x' prefix
    curr_funct_start = '0x'+funct[0][2:].upper()
    curr_funct_end = '0x'+funct[1][2:].upper()

    # Format function
    curr_funct = "\n { address = "+curr_funct_start+", size = "+curr_funct_end
    if debug:
        curr_funct = curr_funct+", src = "+funct[2]
    formatted_functs.append(curr_funct+" }")

# Join the entries with commas instead of appending a comma to each one and
# trimming the last character afterwards: with no functions found, trimming
# would remove the opening bracket and leave an invalid array
output_str = "functions = ["+",".join(formatted_functs)+"\n]"

# Output to file
with open(output_file, "w") as file:
    file.write(output_str)


6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,12 @@ functions = [

You can define function boundaries explicitly using the `functions` property if XenonAnalyse fails to analyze them correctly, for example, with functions containing jump tables.

You can automatically generate these using the `Auto_Function_Parser.py` script. You will need to create an HTML file of your decompiled XEX with IDA using `File -> Produce File -> Create HTML File...` and save the terminal output from running XenonRecomp by appending `> [output log file path]` to the command.

```
python3 Auto_Function_Parser.py [input IDA HTML file path] [input XenonRecomp log file path] [output function list file path]
```

#### Invalid Instruction Skips

```toml
Expand Down