Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BinExport2: better expression pruning #2527

Merged
merged 4 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
- binja: fix unit test failure by fixing up the analysis for file al-khaser_x64.exe_ #2507 @xusheng6
- binja: move the stack string detection to function level #2516 @xusheng6
- BinExport2: fix handling of incorrect thunk functions #2524 @williballenthin
- BinExport2: more precise pruning of expressions #2527 @williballenthin

### capa Explorer Web

Expand Down
63 changes: 39 additions & 24 deletions capa/features/extractors/binexport2/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,25 @@ def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGr
return vertex.HasField("type") and vertex.type == type_


# internal to `build_expression_tree`
# this is unstable: it is subject to change, so don't rely on it!
def _prune_expression_tree_references_to_tree_index(
expression_tree: list[list[int]],
tree_index: int,
):
# `i` is the index of the tree node that we'll search for `tree_index`
# if we remove `tree_index` from it, and it is now empty,
# then we'll need to prune references to `i`.
for i, tree_node in enumerate(expression_tree):
if tree_index in tree_node:
tree_node.remove(tree_index)

if len(tree_node) == 0:
# if the parent node is now empty,
# remove references to that parent node.
_prune_expression_tree_references_to_tree_index(expression_tree, i)


# internal to `build_expression_tree`
# this is unstable: it is subject to change, so don't rely on it!
def _prune_expression_tree_empty_shifts(
Expand All @@ -70,9 +89,7 @@ def _prune_expression_tree_empty_shifts(
#
# Which seems to be as if the shift wasn't there (shift of #0)
# so we want to remove references to this node from any parent nodes.
for tree_node in expression_tree:
if tree_index in tree_node:
tree_node.remove(tree_index)
_prune_expression_tree_references_to_tree_index(expression_tree, tree_index)

return

Expand All @@ -82,7 +99,20 @@ def _prune_expression_tree_empty_shifts(

# internal to `build_expression_tree`
# this is unstable: it is subject to change, so don't rely on it!
def _prune_expression_tree_empty_commas(
def _fixup_expression_tree_references_to_tree_index(
expression_tree: list[list[int]],
existing_index: int,
new_index: int,
):
for tree_node in expression_tree:
for i, index in enumerate(tree_node):
if index == existing_index:
tree_node[i] = new_index


# internal to `build_expression_tree`
# this is unstable: it is subject to change, so don't rely on it!
def _fixup_expression_tree_lonely_commas(
be2: BinExport2,
operand: BinExport2.Operand,
expression_tree: list[list[int]],
Expand All @@ -94,26 +124,12 @@ def _prune_expression_tree_empty_commas(

if expression.type == BinExport2.Expression.OPERATOR:
if len(children_tree_indexes) == 1 and expression.symbol == ",":
# Due to the above pruning of empty LSL or LSR expressions,
# the parents might need to be fixed up.
#
# Specifically, if the pruned node was part of a comma list with two children,
# now there's only a single child, which renders as an extra comma,
# so we replace references to the comma node with the immediate child.
#
# A more correct way of doing this might be to walk up the parents and do fixups,
# but I'm not quite sure how to do this yet. Just do two passes right now.
child = children_tree_indexes[0]

for tree_node in expression_tree:
tree_node.index
if tree_index in tree_node:
tree_node[tree_node.index(tree_index)] = child

return
existing_index = tree_index
new_index = children_tree_indexes[0]
_fixup_expression_tree_references_to_tree_index(expression_tree, existing_index, new_index)

for child_tree_index in children_tree_indexes:
_prune_expression_tree_empty_commas(be2, operand, expression_tree, child_tree_index)
_fixup_expression_tree_lonely_commas(be2, operand, expression_tree, child_tree_index)


# internal to `build_expression_tree`
Expand All @@ -124,7 +140,7 @@ def _prune_expression_tree(
expression_tree: list[list[int]],
):
_prune_expression_tree_empty_shifts(be2, operand, expression_tree, 0)
_prune_expression_tree_empty_commas(be2, operand, expression_tree, 0)
_fixup_expression_tree_lonely_commas(be2, operand, expression_tree, 0)


# this is unstable: it is subject to change, so don't rely on it!
Expand Down Expand Up @@ -173,7 +189,6 @@ def _build_expression_tree(
tree.append(children)

_prune_expression_tree(be2, operand, tree)
_prune_expression_tree(be2, operand, tree)

return tree

Expand Down
34 changes: 30 additions & 4 deletions scripts/inspect-binexport2.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ def _render_expression_tree(
if expression.symbol != "!":
o.write(expression.symbol)

if expression.symbol in ("lsl", "lsr"):
# like: lsl 16
# not like: lsl16
o.write(" ")

child_index = children_tree_indexes[0]
_render_expression_tree(be2, operand, expression_tree, child_index, o)

Expand All @@ -141,7 +146,13 @@ def _render_expression_tree(
child_a = children_tree_indexes[0]
child_b = children_tree_indexes[1]
_render_expression_tree(be2, operand, expression_tree, child_a, o)

o.write(expression.symbol)
if expression.symbol == ",":
# like: 10, 20
# not like: 10,20
o.write(" ")

_render_expression_tree(be2, operand, expression_tree, child_b, o)
return

Expand All @@ -152,11 +163,19 @@ def _render_expression_tree(
child_c = children_tree_indexes[2]
_render_expression_tree(be2, operand, expression_tree, child_a, o)
o.write(expression.symbol)
if expression.symbol == ",":
o.write(" ")
_render_expression_tree(be2, operand, expression_tree, child_b, o)
o.write(expression.symbol)
if expression.symbol == ",":
o.write(" ")
_render_expression_tree(be2, operand, expression_tree, child_c, o)
return

elif len(children_tree_indexes) == 0:
# like when all subtrees have been pruned: don't render anything
return

else:
raise NotImplementedError(len(children_tree_indexes))

Expand Down Expand Up @@ -362,10 +381,17 @@ def main(argv=None):
operands = []
for operand_index in instruction.operand_index:
operand = be2.operand[operand_index]
# Ghidra bug where empty operands (no expressions) may
# exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817)
if len(operand.expression_index) > 0:
operands.append(render_operand(be2, operand, index=operand_index))
if not operand.expression_index:
# Ghidra bug where empty operands (no expressions) may
# exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817)
continue

op = render_operand(be2, operand, index=operand_index)
if not op:
# operand has been pruned away, so don't show it
continue

operands.append(op)

call_targets = ""
if instruction.call_target:
Expand Down
Loading