Show hex and ascii side by side in decodes table

michelcrypt4d4mus · Oct 15, 2022 · cf00058 · cf00058
1 parent 7bb2f8f
commit cf00058
Show file tree

Hide file tree

Showing 8 changed files with 105 additions and 39 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,5 @@
 # NEXT RELEASE
+* Show hex and ascii side by side in decodes table
 
 ### 0.6.2
 * Remove `cairosvg` dependency

diff --git a/tests/test_yaralyze.py b/tests/test_yaralyze.py
@@ -51,17 +51,17 @@ def test_yaralyze_with_files(il_tulipano_path, tulips_yara_path):
 
 
 def test_yaralyze_with_patterns(il_tulipano_path, binary_file_path, tulips_yara_regex):
-    _assert_output_line_count_is_close(942, il_tulipano_path, '-re', tulips_yara_regex)
-    _assert_output_line_count_is_close(83, binary_file_path, '-re', '3Hl0')
-    _assert_output_line_count_is_close(90, binary_file_path, '-hex', HEX_STRING)
+    _assert_output_line_count_is_close(999, il_tulipano_path, '-re', tulips_yara_regex)
+    _assert_output_line_count_is_close(90, binary_file_path, '-re', '3Hl0')
+    _assert_output_line_count_is_close(96, binary_file_path, '-hex', HEX_STRING)
 
 
 def test_file_export(binary_file_path, tulips_yara_path, tmp_dir):
     _run_with_args(binary_file_path, '-Y', tulips_yara_path, '-svg', '-html', '-txt', '-out', tmp_dir)
     rendered_files = files_in_dir(tmp_dir)
     assert len(rendered_files) == 3
     file_sizes = [path.getsize(f) for f in rendered_files]
-    _assert_array_is_close(sorted(file_sizes), [40867, 69127, 216719])
+    _assert_array_is_close(sorted(file_sizes), [45191, 78832, 240046])
 
     for file in rendered_files:
         remove(file)

diff --git a/tests/test_yaralyzer.py b/tests/test_yaralyzer.py
@@ -10,7 +10,7 @@
 from yaralyzer.yaralyzer import Yaralyzer
 
 CLOSENESS_THRESHOLD = 0.05
-EXPECTED_LINES = 942
+EXPECTED_LINES = 1002
 
 
 def test_filename_string(a_yaralyzer):
@@ -22,15 +22,15 @@ def test_yaralyzer_with_files(il_tulipano_path, tulips_yara_path):
 
 
 def test_yaralyzer_with_patterns(il_tulipano_path, tulips_yara_regex):
-    _check_output_linecount(Yaralyzer.for_patterns([tulips_yara_regex], REGEX, il_tulipano_path), 942)
+    _check_output_linecount(Yaralyzer.for_patterns([tulips_yara_regex], REGEX, il_tulipano_path), EXPECTED_LINES)
 
 
 def test_yaralyzer_for_rules_dir(il_tulipano_path):
     _check_output_linecount(Yaralyzer.for_rules_dirs([dirname(il_tulipano_path)], il_tulipano_path))
 
 
 def test_hex_rules(binary_file_path, tulips_yara_path):
-    _check_output_linecount(Yaralyzer.for_rules_files([tulips_yara_path], binary_file_path), 95)
+    _check_output_linecount(Yaralyzer.for_rules_files([tulips_yara_path], binary_file_path), 102)
 
 
 def _check_output_linecount(yaralzyer: Yaralyzer, expected_line_count: int = EXPECTED_LINES) -> None:

diff --git a/yaralyzer/decoding/bytes_decoder.py b/yaralyzer/decoding/bytes_decoder.py
@@ -15,8 +15,8 @@
 from yaralyzer.bytes_match import BytesMatch
 from yaralyzer.config import YaralyzerConfig
 from yaralyzer.decoding.decoding_attempt import DecodingAttempt
-from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
 from yaralyzer.encoding_detection.character_encodings import ENCODING, ENCODINGS_TO_ATTEMPT
+from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
 from yaralyzer.encoding_detection.encoding_detector import EncodingDetector
 from yaralyzer.helpers.dict_helper import get_dict_key_by_value
 from yaralyzer.helpers.rich_text_helper import CENTER, DECODING_ERRORS_MSG, NO_DECODING_ERRORS_MSG
@@ -25,7 +25,7 @@
 from yaralyzer.output.rich_console import console
 from yaralyzer.util.logging import log
 
-# Messages used in the table to show true vs. false (a two element array can be indexed by booleans)
+# Two-element list of the messages shown in the table, indexable by a boolean (False -> [0], True -> [1])
 WAS_DECODABLE_YES_NO = [NO_DECODING_ERRORS_MSG, DECODING_ERRORS_MSG]
 
 # Multiply chardet scores by 100 (again) to make sorting the table easy
@@ -34,7 +34,6 @@
 
 class BytesDecoder:
     def __init__(self, bytes_match: BytesMatch, label: Optional[str] = None) -> None:
-        """Instantiated with _bytes as the whole stream; :bytes_seq tells it how to pull the bytes it will decode"""
         self.bytes_match = bytes_match
         self.bytes = bytes_match.surrounding_bytes
         self.label = label or bytes_match.label
@@ -51,6 +50,7 @@ def __init__(self, bytes_match: BytesMatch, label: Optional[str] = None) -> None
         self.encoding_detector = EncodingDetector(self.bytes)
 
     def print_decode_attempts(self) -> None:
+        """Print the DecodingAttemptsTable."""
         console.line(2)
         self._print_decode_attempt_subheading()
 
@@ -108,7 +108,7 @@ def _print_decode_attempt_subheading(self) -> None:
         console.print(panel, justify=CENTER)
 
     def _track_decode_stats(self) -> None:
-        "Track stats about successful vs. forced vs. failed decode attempts"
+        """Track stats about successful vs. forced vs. failed decode attempts"""
         for decoding in self.decodings:
             if decoding.failed_to_decode:
                 self.was_match_undecodable[decoding.encoding] += 1
@@ -120,11 +120,15 @@ def _track_decode_stats(self) -> None:
                 self.was_match_force_decoded[decoding.encoding] += 1
 
     def _row_from_decoding_attempt(self, decoding: DecodingAttempt) -> DecodingTableRow:
+        """
+        Create a row for the decoding attempts table (a DecodingTableRow) from a DecodingAttempt.
+        If the decoding result is a duplicate of a previous decoding, replace the decoded text
+        with "same output as X" where X is the previous encoding that gave the same result.
+        """
         assessment = self.encoding_detector.get_encoding_assessment(decoding.encoding)
         plain_decoded_string = decoding.decoded_string.plain
         sort_score = assessment.confidence * SCORE_SCALER
 
-        # Replace the decoded text with a "same output as X" where X is the encoding that gave the same result
         if plain_decoded_string in self.decoded_strings.values():
             encoding_with_same_output = get_dict_key_by_value(self.decoded_strings, plain_decoded_string)
             display_text = Text('same output as ', style='color(66) dim italic')

diff --git a/yaralyzer/helpers/bytes_helper.py b/yaralyzer/helpers/bytes_helper.py
@@ -2,6 +2,7 @@
 import re
 from collections import namedtuple
 from io import StringIO
+from sys import byteorder
 
 from rich.console import Console
 from rich.markup import escape
@@ -10,14 +11,18 @@
 from yaralyzer.bytes_match import BytesMatch
 from yaralyzer.config import YaralyzerConfig
 from yaralyzer.encoding_detection.character_encodings import NEWLINE_BYTE
-from yaralyzer.output.rich_console import GREY, console
+from yaralyzer.output.rich_console import (BYTES, BYTES_BRIGHTER, BYTES_BRIGHTEST,
+     BYTES_HIGHLIGHT, GREY, console, console_width)
 from yaralyzer.util.logging import log
 
-HEX_CHARS_PER_GROUP = 8
-HEX_GROUPS_PER_LINE = 4
-
 BytesInfo = namedtuple('BytesInfo', ['size', 'md5', 'sha1', 'sha256'])
 
+HEX_CHARS_PER_GROUP = 8
+SUBTABLE_MAX_WIDTH = console_width() - 35 - 5  # 35 for first 3 cols, 5 for in between hex and ascii
+HEX_UNIT_LENGTH = (HEX_CHARS_PER_GROUP * 3) + HEX_CHARS_PER_GROUP + 4  # 4 for padding between groups
+HEX_GROUPS_PER_LINE = divmod(SUBTABLE_MAX_WIDTH, HEX_UNIT_LENGTH)[0]
+HEX_CHARS_PER_LINE = HEX_CHARS_PER_GROUP * HEX_GROUPS_PER_LINE
+
 
 def get_bytes_info(_bytes: bytes) -> BytesInfo:
     return BytesInfo(
@@ -85,10 +90,42 @@ def hex_view_of_raw_bytes(_bytes: bytes, bytes_match: BytesMatch) -> Text:
     highlight_start_idx = bytes_match.highlight_start_idx * 3
     highlight_end_idx = bytes_match.highlight_end_idx * 3
     hex_str.stylize(bytes_match.highlight_style, highlight_start_idx, highlight_end_idx)
-    lines = hex_str.wrap(console, HEX_CHARS_PER_GROUP * HEX_GROUPS_PER_LINE * 3)
+    lines = hex_str.wrap(console, HEX_CHARS_PER_LINE * 3)
     return Text("\n").join([Text('  ').join(line.wrap(console, HEX_CHARS_PER_GROUP * 3)) for line in lines])
 
 
+def ascii_view_of_raw_bytes(_bytes: bytes, bytes_match: BytesMatch) -> Text:
+    txt = Text('', style=BYTES)
+
+    for i, b in enumerate(_bytes):
+        if i < bytes_match.highlight_start_idx or i > bytes_match.highlight_end_idx:
+            style1 = 'color(246)'
+            style2 = 'color(234)'
+        else:
+            style1 = None
+            style2 = None
+
+        _byte = b.to_bytes(1, byteorder)
+
+        if b < 32:
+            txt.append('*', style=style2 or BYTES_BRIGHTER)
+        elif b < 127:
+            txt.append(_byte.decode('UTF-8'), style1 or BYTES_BRIGHTEST)
+        elif b <= 160:
+            txt.append('*', style=style2 or BYTES_HIGHLIGHT)
+        else:
+            txt.append('*', style=style2 or BYTES)
+
+    segments = [txt[i:i + HEX_CHARS_PER_GROUP] for i in range(0, len(txt), HEX_CHARS_PER_GROUP)]
+
+    lines = [
+        Text('  ').join(segments[i:min(len(segments), i + HEX_GROUPS_PER_LINE)])
+        for i in range(0, len(segments), HEX_GROUPS_PER_LINE)
+    ]
+
+    return Text("\n").join(lines)
+
+
 def hex_text(_bytes: bytes) -> Text:
     return Text(hex_string(_bytes), style=GREY)
 

diff --git a/yaralyzer/helpers/rich_text_helper.py b/yaralyzer/helpers/rich_text_helper.py
@@ -108,7 +108,6 @@ def size_text(num_bytes: int) -> Text:
     kb_txt = prefix_with_plain_text_obj("{:,.1f}".format(num_bytes / 1024), style='bright_cyan', root_style='white')
     kb_txt.append(' kb ')
     bytes_txt = Text('(', 'white') + size_in_bytes_text(num_bytes) + Text(')')
-
     return kb_txt + bytes_txt
 
 

diff --git a/yaralyzer/output/decoding_attempts_table.py b/yaralyzer/output/decoding_attempts_table.py
@@ -14,22 +14,37 @@
 
 from collections import namedtuple
 
+from rich import box
 from rich.table import Table
 from rich.text import Text
 
 from yaralyzer.bytes_match import BytesMatch
 from yaralyzer.encoding_detection.encoding_assessment import EncodingAssessment
-from yaralyzer.helpers.bytes_helper import hex_view_of_raw_bytes, rich_text_view_of_raw_bytes
+from yaralyzer.helpers.bytes_helper import (ascii_view_of_raw_bytes, hex_view_of_raw_bytes,
+     rich_text_view_of_raw_bytes)
 from yaralyzer.helpers.rich_text_helper import CENTER, FOLD, MIDDLE, RIGHT, na_txt
 
+# The confidence and encoding will not be shown in the final display - instead their Text versions are shown
+DecodingTableRow = namedtuple(
+    'DecodingTableRow',
+    [
+        'encoding_text',
+        'confidence_text',
+        'errors_while_decoded',
+        'decoded_string',
+        'confidence',
+        'encoding',
+        'sort_score'
+    ]
+)
 
 DECODE_NOT_ATTEMPTED_MSG = Text('(decode not attempted)', style='no_attempt')
 HEX = Text('HEX', style='bytes.title')
 RAW_BYTES = Text('Raw', style=f"bytes")
 
 
 def build_decoding_attempts_table(bytes_match: BytesMatch) -> Table:
-    """First rows are the raw / hex views of the bytes"""
+    """First rows are the hex / raw views of the bytes, then 1 row per decoding attempt."""
     table = Table(show_lines=True, border_style='bytes', header_style='color(101) bold')
 
     def add_col(title, **kwargs):
@@ -42,26 +57,12 @@ def add_col(title, **kwargs):
     add_col('Decoded Output', justify='left')
 
     na = na_txt(style=HEX.style)
-    table.add_row(HEX, na, na, hex_view_of_raw_bytes(bytes_match.surrounding_bytes, bytes_match))
+    table.add_row(HEX, na, na, _hex_preview_subtable(bytes_match))
     na = na_txt(style=RAW_BYTES.style)
     table.add_row(RAW_BYTES, na, na, rich_text_view_of_raw_bytes(bytes_match.surrounding_bytes, bytes_match))
     return table
 
 
-# The confidence and encoding will not be shown in the final display - instead their Text versions are shown
-DecodingTableRow = namedtuple(
-    'DecodingTableRow',
-    [
-        'encoding_text',
-        'confidence_text',
-        'errors_while_decoded',
-        'decoded_string',
-        'confidence',
-        'encoding',
-        'sort_score'
-    ])
-
-
 def decoding_table_row(assessment: EncodingAssessment, is_forced: Text, txt: Text, score: float) -> DecodingTableRow:
     """Get a table row for a decoding attempt"""
     return DecodingTableRow(
@@ -77,3 +78,26 @@ def decoding_table_row(assessment: EncodingAssessment, is_forced: Text, txt: Tex
 def assessment_only_row(assessment: EncodingAssessment, score) -> DecodingTableRow:
     """Build a row with just chardet assessment data and no actual decoded string"""
     return decoding_table_row(assessment, na_txt(), DECODE_NOT_ATTEMPTED_MSG, score)
+
+
+def _hex_preview_subtable(bytes_match: BytesMatch) -> Table:
+    """Build a sub table for hex view (hex on one side, ascii on the other side)."""
+    hex_table = Table(
+        'hex',
+        'ascii',
+        border_style='color(235) dim',
+        header_style='color(101) bold',
+        box=box.MINIMAL,
+        show_lines=True,
+        show_header=True,
+        show_edge=False,
+        padding=(0, 1, 0, 2),
+        pad_edge=False
+    )
+
+    hex_table.add_row(
+        hex_view_of_raw_bytes(bytes_match.surrounding_bytes, bytes_match),
+        ascii_view_of_raw_bytes(bytes_match.surrounding_bytes, bytes_match)
+    )
+
+    return hex_table
diff --git a/yaralyzer/yara/yara_match.py b/yaralyzer/yara/yara_match.py
@@ -49,20 +49,21 @@
 
 class YaraMatch:
     def __init__(self, match: dict, matched_against_bytes_label: Text) -> None:
-        self.match: dict = match
+        self.match = match
         self.rule_name = match['rule']
-        self.label: Text = matched_against_bytes_label.copy().append(f" matched rule: '", style='matched_rule')
+        self.label = matched_against_bytes_label.copy().append(f" matched rule: '", style='matched_rule')
         self.label.append(self.rule_name, style='on bright_red bold').append("'!", style='siren')
 
     def __rich_console__(self, _console: Console, options: ConsoleOptions) -> RenderResult:
+        """Renders panels showing the match label and the color-highlighted raw YARA match info."""
         yield(Text("\n"))
         yield Padding(Panel(self.label, expand=False, style=f"on color(251) reverse"), MATCH_PADDING)
         yield(RAW_YARA_THEME_TXT)
         yield Padding(Panel(_rich_yara_match(self.match)), MATCH_PADDING)
 
 
 def _rich_yara_match(element: Any, depth: int = 0) -> Text:
-    """Mildly painful/hacky way of coloring a yara result hash"""
+    """Mildly painful/hacky way of coloring a yara result hash."""
     indent = Text((depth + 1) * 4 * ' ')
     end_indent = Text(depth * 4 * ' ')
 
@@ -78,7 +79,7 @@ def _rich_yara_match(element: Any, depth: int = 0) -> Text:
         if len(element) == 0:
             txt = Text('[]', style='white')
         else:
-            total_length = sum([len(str(e)) for e in element]) + ((len(element) - 1) * 2) + + len(indent) + 2
+            total_length = sum([len(str(e)) for e in element]) + ((len(element) - 1) * 2) + len(indent) + 2
             elements_txt = [_rich_yara_match(e, depth + 1) for e in element]
             list_txt = Text('[', style='white')