
Commit

feat: added comments, argparse short codes, deduplication for meetings
lxndrblz committed Jun 29, 2021
1 parent 596edcc commit ee3d9e7
Showing 3 changed files with 85 additions and 57 deletions.
4 changes: 2 additions & 2 deletions utils/dump_leveldb.py
@@ -32,8 +32,8 @@ def parse_cmdline():
description = 'Forensics.im Dump Tool'
parser = argparse.ArgumentParser(description=description)
required_group = parser.add_argument_group('required arguments')
required_group.add_argument('--filepath', required=True, help='File path to the IndexedDB.')
required_group.add_argument('--outputpath', required=True, help='File path to the processed output.')
required_group.add_argument('-f', '--filepath', required=True, help='File path to the IndexedDB.')
required_group.add_argument('-o', '--outputpath', required=True, help='File path to the processed output.')
args = parser.parse_args()
return args
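
Both spellings resolve to the same argparse destination, so existing calls with the long flags keep working; a minimal standalone sketch with placeholder paths:

import argparse

parser = argparse.ArgumentParser(description='Forensics.im Dump Tool')
required_group = parser.add_argument_group('required arguments')
required_group.add_argument('-f', '--filepath', required=True, help='File path to the IndexedDB.')
required_group.add_argument('-o', '--outputpath', required=True, help='File path to the processed output.')

# '-f' and '--filepath' share the destination 'filepath', so either form works.
args = parser.parse_args(['-f', '/tmp/IndexedDB', '-o', '/tmp/out'])
assert args.filepath == '/tmp/IndexedDB' and args.outputpath == '/tmp/out'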

128 changes: 74 additions & 54 deletions utils/main.py
@@ -46,11 +46,15 @@ def extract_fields(record, keys):
def parse_contacts(contacts):
cleaned = []
for contact in contacts:
value = contact['value']
x = extract_fields(value, 'contact')
x['origin_file'] = contact['origin_file']
x['record_type'] = 'contact'
cleaned.append(x)
try:
value = contact['value']
x = extract_fields(value, 'contact')
x['origin_file'] = contact['origin_file']
x['record_type'] = 'contact'
cleaned.append(x)
except (UnicodeDecodeError, KeyError):
print("Could not decode contact.")
print(contact)
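
One subtlety worth noting about the exception clause: Python evaluates "except UnicodeDecodeError or KeyError" as "except UnicodeDecodeError" (the "or" expression short-circuits), so the tuple form is required to catch both. A quick self-contained check:

# Only the tuple form catches both exception types; 'A or B' evaluates to just A.
try:
    {}['missing']                        # raises KeyError
except (UnicodeDecodeError, KeyError):
    print('caught')                      # reached only with the tuple form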

# Deduplicate based on mri - should be unique anyway
cleaned = deduplicate(cleaned, 'mri')
@@ -65,42 +69,49 @@ def parse_reply_chain(reply_chains):
message = value['messages']
for key, value in message.items():
# parse as a normal chat message

x = extract_fields(value, 'message')
x['origin_file'] = reply_chain['origin_file']
# Files send without any description will be of type text
if x['messagetype'] == 'RichText/Html' or x['messagetype'] == 'Text':
# Get the call logs
if 'call-log' in x['properties']:
# call logs are string escaped
x['properties']['call-log'] = json.loads(value['properties']['call-log'])
x['record_type'] = 'call'
# Get the reactions from the chat
elif 'activity' in x['properties']:
# reactionInChat are for personal conversations, reactions are for posts or comments
if x['properties']['activity']['activityType'] == 'reactionInChat' or 'reaction':
x['record_type'] = 'reaction'
# normal message, posts, file transfers
else:
x['content'] = strip_html_tags(x['content'])
x['record_type'] = 'message'

# handle string escaped json arrays within properties
if 'links' in x['properties']:
x['properties']['links'] = json.loads(x['properties']['links'])
if 'files' in x['properties']:
x['properties']['files'] = json.loads(x['properties']['files'])
# convert the timestamps
x['createdTime'] = convert_time_stamps(x['createdTime'])
x['version'] = convert_time_stamps(x['version'])
# manually construct the cachedDeduplicationKey, because not every replychain appears to have it
x['cachedDeduplicationKey'] = str(x['creator']+x['clientmessageid'])
cleaned.append(x)
# Other types include ThreadActivity/TopicUpdate and ThreadActivity/AddMember
# -> ThreadActivity/TopicUpdate occurs for meeting updates
# -> ThreadActivity/AddMember occurs when someone gets added to a chat

# Deduplicate
try:
x = extract_fields(value, 'message')
x['origin_file'] = reply_chain['origin_file']
# Files sent without any description will be of type Text
if x['messagetype'] == 'RichText/Html' or x['messagetype'] == 'Text':
# Get the call logs
if 'call-log' in x['properties']:
# call logs are string escaped
x['properties']['call-log'] = json.loads(value['properties']['call-log'])
x['record_type'] = 'call'
# Get the reactions from the chat
elif 'activity' in x['properties']:
# reactionInChat are for personal conversations, reactions are for posts or comments
if x['properties']['activity']['activityType'] in ('reactionInChat', 'reaction'):
x['record_type'] = 'reaction'
# normal message, posts, file transfers
else:
x['content'] = strip_html_tags(x['content'])
x['record_type'] = 'message'

# handle string escaped json arrays within properties
if 'links' in x['properties']:
x['properties']['links'] = json.loads(x['properties']['links'])
if 'files' in x['properties']:
x['properties']['files'] = json.loads(x['properties']['files'])
# convert the timestamps
x['createdTime'] = convert_time_stamps(x['createdTime'])
x['version'] = convert_time_stamps(x['version'])
# manually construct the cachedDeduplicationKey, because not every reply chain appears to have this key.
# cachedDeduplicationKeys look like 8:orgid:54dd27a7-fbb0-4bf0-8208-a4b31a578a3f6691174965251523000
# They are composed of:
# -> the creator 8:orgid:54dd27a7-fbb0-4bf0-8208-a4b31a578a3f
# -> the clientmessageid 6691174965251523000
if x['creator'] is not None and x['clientmessageid'] is not None:
x['cachedDeduplicationKey'] = str(x['creator']+x['clientmessageid'])
cleaned.append(x)
# Other types include ThreadActivity/TopicUpdate and ThreadActivity/AddMember
# -> ThreadActivity/TopicUpdate occurs for meeting updates
# -> ThreadActivity/AddMember occurs when someone gets added to a chat
except (UnicodeDecodeError, KeyError):
print("Could not decode reply chain.")
print(reply_chain)
# Deduplicate based on cachedDeduplicationKey, as messages often appear multiple times within the reply chains
cleaned = deduplicate(cleaned, 'cachedDeduplicationKey')
return cleaned
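
The deduplicate helper itself is not part of this diff; below is a minimal sketch of what key-based deduplication could look like, assuming the deduplicate(records, key) signature used above (the actual implementation may differ):

def deduplicate(records, key):
    # Keep the first record seen per key value; records without the key are kept as-is.
    seen = set()
    unique = []
    for record in records:
        value = record.get(key)
        if value is None:
            unique.append(record)
        elif value not in seen:
            seen.add(value)
            unique.append(record)
    return unique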

@@ -113,24 +124,33 @@ def parse_conversations(conversations):
x = extract_fields(value, 'conversation')
# Include file origin for records
x['origin_file'] = conversation['origin_file']
if x['type'] == 'Meeting':
# assign the type for further processing as the object store might not be sufficient
if 'threadProperties' in x:
if 'meeting' in x['threadProperties']:
x['threadProperties']['meeting'] = json.loads(x['threadProperties']['meeting'])
x['record_type'] = 'meeting'
cleaned.append(x)
except UnicodeDecodeError:
# First, make sure that the conversation has a cachedDeduplicationKey
if 'lastMessage' in conversation['value']:
if 'cachedDeduplicationKey' in conversation['value']['lastMessage']:
x['cachedDeduplicationKey'] = conversation['value']['lastMessage']['cachedDeduplicationKey']
# we are only interested in meetings for now
if x['type'] == 'Meeting':
# assign the type for further processing as the object store might not be sufficient
if 'threadProperties' in x:
if 'meeting' in x['threadProperties']:
x['threadProperties']['meeting'] = json.loads(x['threadProperties']['meeting'])
x['record_type'] = 'meeting'
cleaned.append(x)
except (UnicodeDecodeError, KeyError):
print("Could not decode meeting.")
# Other types include Message, Chat, Space
print(conversation)
# Other types include Message, Chat, and Space; however, these did not contain any records of evidential value
# for my test data. It might be worthwhile to investigate them further with a different test scenario.

# Deduplicate
cleaned = deduplicate(cleaned, 'cachedDeduplicationKey')
return cleaned
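
Several Teams properties (call-log, links, files, threadProperties.meeting) store nested JSON as escaped strings, which is why the extra json.loads pass recurs throughout the parsers; a tiny illustration with invented values:

import json

# Hypothetical record: the nested meeting object arrives as a JSON string.
record = {'threadProperties': {'meeting': '{"subject": "Weekly sync"}'}}
meeting = json.loads(record['threadProperties']['meeting'])
print(meeting['subject'])  # Weekly sync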


def parse_records(records):
parsed_records = []

# Parse the records based on the store they are in. Some records, such as meetings, appear in multiple stores.
# The most appropriate one was used in this case.
# Parse the records based on the store they are in.

# parse contacts
contacts = [d for d in records if d['store'] == 'people']
@@ -185,8 +205,8 @@ def parse_cmdline():
description = 'Forensics.im Xtract Tool'
parser = argparse.ArgumentParser(description=description)
required_group = parser.add_argument_group('required arguments')
required_group.add_argument('--filepath', required=True, help='File path to the IndexedDB.')
required_group.add_argument('--outputpath', required=True, help='File path to the processed output.')
required_group.add_argument('-f', '--filepath', required=True, help='File path to the IndexedDB.')
required_group.add_argument('-o', '--outputpath', required=True, help='File path to the processed output.')
args = parser.parse_args()
return args

10 changes: 9 additions & 1 deletion utils/shared.py
@@ -4,7 +4,7 @@
from ccl_chrome_indexeddb import ccl_blink_value_deserializer, ccl_chromium_indexeddb, ccl_v8_value_deserializer, \
ccl_leveldb

# db ID (second byte) is constant, store ids vary based on what we are looking for
# db ID (second byte) is constant for the version analysed, store ids vary based on what we are looking for.
TEAMS_DB_PREFIX = {'replychains': b'\x00\x05\x02\x01', 'conversations': b'\x00\x05\x04\x01',
'people': b'\x00\x05\x07\x01'}
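
For orientation, these prefixes appear to follow Chromium's IndexedDB key layout: a header byte encoding the field lengths, followed by the database id, object store id, and index id. The breakdown below is an assumption based on that layout, not something stated in this repo:

prefix = b'\x00\x05\x04\x01'  # conversations
# Header byte 0x00 -> database id, store id and index id are one byte each (assumed layout).
db_id, store_id, index_id = prefix[1], prefix[2], prefix[3]
print(db_id, store_id, index_id)  # 5 4 1: db id is constant across stores, store ids differ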

Expand All @@ -13,6 +13,8 @@ def deserialize(db):
# Deserializer is adapted from the script below, but uses constant database and object store IDs rather than looping through the DBs forever.
# https://github.com/cclgroupltd/ccl_chrome_indexeddb/blob/master/ccl_chromium_indexeddb.py

# TODO refactor code to a subclass of RawLevelDB and override the iterator functionality

blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer()

deserialized_db = []
@@ -36,11 +38,15 @@ def deserialize(db):

val_idx += len(varint_raw)

# read the raw value of the record.
obj_raw = io.BytesIO(record.value[val_idx:])

# Initialize deserializer and try deserialization.
deserializer = ccl_v8_value_deserializer.Deserializer(
obj_raw, host_object_delegate=blink_deserializer.read)
try:
value = deserializer.read()
# TODO refactor code to a generator.
deserialized_db.append({'value': value, 'origin_file': record.origin_file, 'store': datastore})
except Exception as e:
pass
@@ -57,12 +63,14 @@ def write_results_to_json(data, outputpath):


def parse_db(filepath):
# Open raw access to a LevelDB and deserialize the records.
db = ccl_leveldb.RawLevelDb(filepath)
extracted_values = deserialize(db)
return extracted_values
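
End to end, the helpers compose as below; the LevelDB path is a hypothetical placeholder, since the actual location depends on the Teams profile being examined:

# Hypothetical usage of parse_db and write_results_to_json from this module.
records = parse_db('IndexedDB/https_teams.microsoft.com_0.indexeddb.leveldb')
write_results_to_json(records, 'teams.json')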


def parse_json():
# Read data from a file. This is only for testing purposes.
try:
with open('teams.json') as json_file:
data = json.load(json_file)
