diff --git a/.gitmodules b/.gitmodules
index c210bb0..4099ff0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,3 @@
-[submodule ".\\utils\\ccl_chrome_indexeddb"]
-	path = .\\utils\\ccl_chrome_indexeddb
-	url = https://github.com/cclgroupltd/ccl_chrome_indexeddb/
-[submodule "utils\\ccl_chrome_indexeddb"]
-	path = utils\\ccl_chrome_indexeddb
-	url = https://github.com/cclgroupltd/ccl_chrome_indexeddb/
 [submodule "forensicsim-data"]
 	path = forensicsim-data
 	url = https://github.com/KarelZe/forensicsim-data.git
diff --git a/requirements.txt b/requirements.txt
index 1448a7d..4871cce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 beautifulsoup4~=4.9.3
 chardet~=4.0.0
 click~=8.0.1
+chromedb @ git+https://github.com/karelze/ccl_chrome_indexeddb@master
 colorama~=0.4.4
 pause~=0.3
 pyautogui~=0.9.54
diff --git a/utils/.gitignore b/utils/.gitignore
deleted file mode 100644
index 71f0b4c..0000000
--- a/utils/.gitignore
+++ /dev/null
@@ -1,141 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g.
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# Teams chat logs -sample/ diff --git a/utils/ccl_chrome_indexeddb/.gitignore b/utils/ccl_chrome_indexeddb/.gitignore deleted file mode 100644 index 90a13b6..0000000 --- a/utils/ccl_chrome_indexeddb/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/*.bin diff --git a/utils/ccl_chrome_indexeddb/Chromium_dump_local_storage.py b/utils/ccl_chrome_indexeddb/Chromium_dump_local_storage.py deleted file mode 100644 index 2a6a57d..0000000 --- a/utils/ccl_chrome_indexeddb/Chromium_dump_local_storage.py +++ /dev/null @@ -1,119 +0,0 @@ -""" -Copyright 2021, CCL Forensics -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-""" - -import sys -import pathlib -import datetime -import sqlite3 -import ccl_chromium_localstorage - -__version__ = "0.1" -__description__ = "Dumps a Chromium localstorage leveldb to sqlite for review" -__contact__ = "Alex Caithness" - -DB_SCHEMA = """ -CREATE TABLE storage_keys ("_id" INTEGER PRIMARY KEY AUTOINCREMENT, "storage_key" TEXT); -CREATE TABLE batches ("start_ldbseq" INTEGER PRIMARY KEY, - "end_ldbseq" INTEGER, - "storage_key" INTEGER, - "timestamp" INTEGER); -CREATE TABLE records ("_id" INTEGER PRIMARY KEY AUTOINCREMENT, - "storage_key" INTEGER, - "key" TEXT, - "value" TEXT, - "batch" INTEGER, - "ldbseq" INTEGER); -CREATE INDEX "storage_keys_storage_key" ON "storage_keys" ("storage_key"); - -CREATE VIEW "records_view" AS - SELECT - storage_keys.storage_key AS "storage_key", - records."key" AS "key", - records.value AS "value", - datetime(batches."timestamp", 'unixepoch') AS "batch_timestamp", - records.ldbseq AS "ldbseq" - FROM records - INNER JOIN storage_keys ON records.storage_key = storage_keys._id - INNER JOIN batches ON records.batch = batches.start_ldbseq - ORDER BY records.ldbseq; -""" - -INSERT_STORAGE_KEY_SQL = """INSERT INTO "storage_keys" ("storage_key") VALUES (?);""" -INSERT_BATCH_SQL = """INSERT INTO "batches" ("start_ldbseq", "end_ldbseq", "storage_key", "timestamp") - VALUES (?, ?, ?, ?);""" -INSERT_RECORD_SQL = """INSERT INTO "records" ("storage_key", "key", "value", "batch", "ldbseq") - VALUES (?, ?, ?, ?, ?);""" - - -def main(args): - level_db_in_dir = pathlib.Path(args[0]) - db_out_path = pathlib.Path(args[1]) - - if db_out_path.exists(): - raise ValueError("output database already exists") - - local_storage = ccl_chromium_localstorage.LocalStoreDb(level_db_in_dir) - out_db = sqlite3.connect(db_out_path) - out_db.executescript(DB_SCHEMA) - cur = out_db.cursor() - - storage_keys_lookup = {} - for storage_key in local_storage.iter_storage_keys(): - cur.execute(INSERT_STORAGE_KEY_SQL, (storage_key,)) - cur.execute("SELECT last_insert_rowid();") - storage_key_id = cur.fetchone()[0] - storage_keys_lookup[storage_key] = storage_key_id - - for batch in local_storage.iter_batches(): - cur.execute( - INSERT_BATCH_SQL, - ( - batch.start, - batch.end, - storage_keys_lookup[batch.storage_key], - batch.timestamp.replace(tzinfo=datetime.timezone.utc).timestamp(), - ), - ) - - for record in local_storage.iter_all_records(): - batch = local_storage.find_batch(record.leveldb_seq_number) - batch_id = batch.start if batch is not None else None - cur.execute( - INSERT_RECORD_SQL, - ( - storage_keys_lookup[record.storage_key], - record.script_key, - record.value, - batch_id, - record.leveldb_seq_number, - ), - ) - - cur.close() - out_db.commit() - out_db.close() - - -if __name__ == "__main__": - if len(sys.argv) != 3: - print(f"{pathlib.Path(sys.argv[0]).name} ") - exit(1) - main(sys.argv[1:]) diff --git a/utils/ccl_chrome_indexeddb/Chromium_dump_session_storage.py b/utils/ccl_chrome_indexeddb/Chromium_dump_session_storage.py deleted file mode 100644 index 711e4f5..0000000 --- a/utils/ccl_chrome_indexeddb/Chromium_dump_session_storage.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Copyright 2021, CCL Forensics -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software 
is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import sys -import pathlib -import sqlite3 -import ccl_chromium_sessionstorage - -__version__ = "0.1" -__description__ = "Dumps a Chromium sessionstorage leveldb to sqlite for review" -__contact__ = "Alex Caithness" - -DB_SCHEMA = """ -CREATE TABLE "hosts" ("_id" INTEGER PRIMARY KEY AUTOINCREMENT, "host" TEXT); -CREATE TABLE "guids" ("_id" INTEGER PRIMARY KEY AUTOINCREMENT, "guid" TEXT); -CREATE TABLE "items" ("_id" INTEGER PRIMARY KEY AUTOINCREMENT, - "host" INTEGER, - "guid" INTEGER, - "ldbseq" INTEGER, - "key" TEXT, - "value" TEXT); -CREATE INDEX "item_host" ON "items" ("host"); -CREATE INDEX "item_ldbseq" ON "items" ("ldbseq"); - -CREATE VIEW items_view AS - SELECT "items"."ldbseq", - "hosts"."host", - "items"."key", - "items"."value", - "guids"."guid" - FROM "items" - LEFT JOIN "hosts" ON "items"."host" = "hosts"."_id" - LEFT JOIN "guids" ON "items"."guid" = "guids"."_id" - ORDER BY "items"."ldbseq"; -""" - -INSERT_HOST_SQL = """INSERT INTO "hosts" ("host") VALUES (?);""" -INSERT_ITEM_SQL = ( - """INSERT INTO "items" (host, guid, ldbseq, key, value) VALUES (?, ?, ?, ?, ?);""" -) - - -def main(args): - level_db_in_dir = pathlib.Path(args[0]) - db_out_path = pathlib.Path(args[1]) - - if db_out_path.exists(): - raise ValueError("output database already exists") - - session_storage = ccl_chromium_sessionstorage.SessionStoreDb(level_db_in_dir) - out_db = sqlite3.connect(db_out_path) - out_db.executescript(DB_SCHEMA) - cur = out_db.cursor() - for host in session_storage.iter_hosts(): - cur.execute(INSERT_HOST_SQL, (host,)) - cur.execute("SELECT last_insert_rowid();") - host_id = cur.fetchone()[0] - host_kvs = session_storage.get_all_for_host(host) - - for key, values in host_kvs.items(): - for value in values: - cur.execute( - INSERT_ITEM_SQL, - (host_id, None, value.leveldb_sequence_number, key, value.value), - ) - - for key, value in session_storage.iter_orphans(): - cur.execute( - INSERT_ITEM_SQL, - (None, None, value.leveldb_sequence_number, key, value.value), - ) - - cur.close() - out_db.commit() - out_db.close() - - -if __name__ == "__main__": - if len(sys.argv) != 3: - print(f"{pathlib.Path(sys.argv[0]).name} ") - exit(1) - main(sys.argv[1:]) diff --git a/utils/ccl_chrome_indexeddb/LICENSE b/utils/ccl_chrome_indexeddb/LICENSE deleted file mode 100644 index 8b9f05e..0000000 --- a/utils/ccl_chrome_indexeddb/LICENSE +++ /dev/null @@ -1,19 +0,0 @@ -Copyright 2020, CCL Forensics - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above 
copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/utils/ccl_chrome_indexeddb/README.md b/utils/ccl_chrome_indexeddb/README.md deleted file mode 100644 index 5b65a84..0000000 --- a/utils/ccl_chrome_indexeddb/README.md +++ /dev/null @@ -1,139 +0,0 @@ -# ccl_chrome_indexeddb -This repository contains (sometimes partial) re-implementations of the technologies involved in reading IndexedDB data -in Chrome-esque applications. -This includes: -* Snappy decompression -* LevelDB -* V8 object deserialization -* Blink object deserialization -* IndexedDB wrapper - -### Blog -Read a blog on the subject here: https://www.cclsolutionsgroup.com/post/indexeddb-on-chromium - -### Caveats -There is a fair amount of work yet to be done in terms of documentation, but -the modules should be fine for pulling data out of IndexedDB, with the following -caveats: - -#### LevelDB deleted data -The LevelDB module will spit out live and deleted/old versions of records -indiscriminately; it's possible to differentiate between them with some -work, but that hasn't really been baked into the modules as they currently -stand. So you are getting deleted data "for free" currently...whether you -want it or not. - -#### Blink data types -I am fairly satisfied that all the possible V8 object types are accounted for -(but I'm happy to be shown otherwise and get that fixed of course!), but it -is likely that the hosted Blink objects aren't all there yet; so if you hit -upon an error coming from inside ccl_blink_value_deserializer and can point -me towards test data, I'd be very thankful! - -#### Cyclic references -It is noted in the V8 source that recursive referencing is possible in the -serialization, we're not yet accounting for that so if Python throws a -`RecursionError` that's likely what you're seeing. The plan is to use a -similar approach to ccl_bplist where the collection types are subclassed and -do Just In Time resolution of the items, but that isn't done yet. - -## Using the modules -There are two methods for accessing records - a more pythonic API using a set of -wrapper objects and a raw API which doesn't mask the underlying workings. There is -unlikely to be much benefit to using the raw API in most cases, so the wrapper objects -are recommended in most cases. 
- -### Wrapper API -```python -import sys -import ccl_chromium_indexeddb - -# assuming command line arguments are paths to the .leveldb and .blob folders -leveldb_folder_path = sys.argv[1] -blob_folder_path = sys.argv[2] - -# open the indexedDB: -wrapper = ccl_chromium_indexeddb.WrappedIndexDB(leveldb_folder_path, blob_folder_path) - -# You can check the databases present using `wrapper.database_ids` - -# Databases can be accessed from the wrapper in a number of ways: -db = wrapper[2] # accessing database using id number -db = wrapper["MyTestDatabase"] # accessing database using name (only valid for single origin indexedDB instances) -db = wrapper["MyTestDatabase", "file__0@1"] # accessing the database using name and origin -# NB using name and origin is likely the preferred option in most cases - -# The wrapper object also supports checking for databases using `in` - -# You can check for object store names using `db.object_store_names` - -# Object stores can be accessed from the database in a number of ways: -obj_store = db[1] # accessing object store using id number -obj_store = db["store"] # accessing object store using name - -# Records can then be accessed by iterating the object store in a for-loop -for record in obj_store.iterate_records(): - print(record.user_key) - print(record.value) - - # if this record contained a FileInfo object somewhere linking - # to data stored in the blob dir, we could access that data like - # so (assume the "file" key in the record value is our FileInfo): - with record.get_blob_stream(record.value["file"]) as f: - file_data = f.read() - -# By default, any errors in decoding records will bubble an exception -# which might be painful when iterating records in a for-loop, so either -# passing True into the errors_to_stdout argument and/or by passing in an -# error handler function to bad_deserialization_data_handler, you can -# perform logging rather than crashing: - -for record in obj_store.iterate_records( - errors_to_stdout=True, - bad_deserializer_data_handler= lambda k,v: print(f"error: {k}, {v}")): - print(record.user_key) - print(record.value) -``` - -### Raw access API -```python -import sys -import ccl_chromium_indexeddb - -# assuming command line arguments are paths to the .leveldb and .blob folders -leveldb_folder_path = sys.argv[1] -blob_folder_path = sys.argv[2] - -# open the database: -db = ccl_chromium_indexeddb.IndexedDb(leveldb_folder_path, blob_folder_path) - -# there can be multiple databases, so we need to iterate through them (NB -# DatabaseID objects contain additional metadata, they aren't just ints): -for db_id_meta in db.global_metadata.db_ids: - # and within each database, there will be multiple object stores so we - # will need to know the maximum object store number (this process will be - # cleaned up in future releases): - max_objstore_id = db.get_database_metadata( - db_id_meta.dbid_no, - ccl_chromium_indexeddb.DatabaseMetadataType.MaximumObjectStoreId) - - # if the above returns None, then there are no stores in this db - if max_objstore_id is None: - continue - - # there may be multiple object stores, so again, we iterate through them - # this time based on the id number. 
Object stores start at id 1 and the - # max_objstore_id is inclusive: - for obj_store_id in range(1, max_objstore_id + 1): - # now we can ask the indexeddb wrapper for all records for this db - # and object store: - for record in db.iterate_records(db_id_meta.dbid_no, obj_store_id): - print(f"key: {record.user_key}") - print(f"key: {record.value}") - - # if this record contained a FileInfo object somewhere linking - # to data stored in the blob dir, we could access that data like - # so (assume the "file" key in the record value is our FileInfo): - with record.get_blob_stream(record.value["file"]) as f: - file_data = f.read() -``` diff --git a/utils/ccl_chrome_indexeddb/__init__.py b/utils/ccl_chrome_indexeddb/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/utils/ccl_chrome_indexeddb/ccl_blink_value_deserializer.py b/utils/ccl_chrome_indexeddb/ccl_blink_value_deserializer.py deleted file mode 100644 index a29fd52..0000000 --- a/utils/ccl_chrome_indexeddb/ccl_blink_value_deserializer.py +++ /dev/null @@ -1,232 +0,0 @@ -""" -Copyright 2020, CCL Forensics - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import sys -import enum -import typing -from dataclasses import dataclass - -from . import ccl_v8_value_deserializer - -# See: https://chromium.googlesource.com/chromium/src/third_party/+/master/blink/renderer/bindings/core/v8/serialization - -# WebCoreStrings are read as (length:uint32_t, string:UTF8[length]). -# RawStrings are read as (length:uint32_t, string:UTF8[length]). -# RawUCharStrings are read as -# (length:uint32_t, string:UChar[length/sizeof(UChar)]). -# RawFiles are read as -# (path:WebCoreString, url:WebCoreStrng, type:WebCoreString). -# There is a reference table that maps object references (uint32_t) to -# v8::Values. -# Tokens marked with (ref) are inserted into the reference table and given the -# next object reference ID after decoding. -# All tags except InvalidTag, PaddingTag, ReferenceCountTag, VersionTag, -# GenerateFreshObjectTag and GenerateFreshArrayTag push their results to the -# deserialization stack. -# There is also an 'open' stack that is used to resolve circular references. -# Objects or arrays may contain self-references. Before we begin to deserialize -# the contents of these values, they are first given object reference IDs (by -# GenerateFreshObjectTag/GenerateFreshArrayTag); these reference IDs are then -# used with ObjectReferenceTag to tie the recursive knot. 
- -__version__ = "0.1" -__description__ = ( - "Partial reimplementation of the Blink Javascript Object Serialization" -) -__contact__ = "Alex Caithness" - -__DEBUG = True - - -def log(msg, debug_only=True): - if __DEBUG or not debug_only: - caller_name = sys._getframe(1).f_code.co_name - caller_line = sys._getframe(1).f_code.co_firstlineno - print(f"{caller_name} ({caller_line}):\t{msg}") - - -class BlobIndexType(enum.Enum): - Blob = 0 - File = 1 - - -@dataclass -class BlobIndex: - index_type: BlobIndexType - index_id: int - - -class Constants: - tag_kMessagePortTag = b"M" # index:int -> MessagePort. Fills the result with - # transferred MessagePort. - tag_kMojoHandleTag = b"h" # index:int -> MojoHandle. Fills the result with - # transferred MojoHandle. - tag_kBlobTag = b"b" # uuid:WebCoreString, type:WebCoreString, size:uint64_t -> - # Blob (ref) - tag_kBlobIndexTag = b"i" # index:int32_t -> Blob (ref) - tag_kFileTag = b"f" # file:RawFile -> File (ref) - tag_kFileIndexTag = b"e" # index:int32_t -> File (ref) - tag_kDOMFileSystemTag = b"d" # type : int32_t, name:WebCoreString, - # uuid:WebCoreString -> FileSystem (ref) - tag_kNativeFileSystemFileHandleTag = b"n" # name:WebCoreString, index:uint32_t - # -> NativeFileSystemFileHandle (ref) - tag_kNativeFileSystemDirectoryHandleTag = ( - b"N" # name:WebCoreString, index:uint32_t -> - ) - # NativeFileSystemDirectoryHandle (ref) - tag_kFileListTag = b"l" # length:uint32_t, files:RawFile[length] -> FileList (ref) - tag_kFileListIndexTag = ( - b"L" # length:uint32_t, files:int32_t[length] -> FileList (ref) - ) - tag_kImageDataTag = b"#" # tags terminated by ImageSerializationTag::kEnd (see - # SerializedColorParams.h), width:uint32_t, - # height:uint32_t, pixelDataLength:uint64_t, - # data:byte[pixelDataLength] - # -> ImageData (ref) - tag_kImageBitmapTag = b"g" # tags terminated by ImageSerializationTag::kEnd (see - # SerializedColorParams.h), width:uint32_t, - # height:uint32_t, pixelDataLength:uint32_t, - # data:byte[pixelDataLength] - # -> ImageBitmap (ref) - tag_kImageBitmapTransferTag = ( - b"G" # index:uint32_t -> ImageBitmap. For ImageBitmap transfer - ) - tag_kOffscreenCanvasTransferTag = b"H" # index, width, height, id, - # filter_quality::uint32_t -> - # OffscreenCanvas. 
For OffscreenCanvas - # transfer - tag_kReadableStreamTransferTag = b"r" # index:uint32_t - tag_kTransformStreamTransferTag = b"m" # index:uint32_t - tag_kWritableStreamTransferTag = b"w" # index:uint32_t - tag_kDOMPointTag = b"Q" # x:Double, y:Double, z:Double, w:Double - tag_kDOMPointReadOnlyTag = b"W" # x:Double, y:Double, z:Double, w:Double - tag_kDOMRectTag = b"E" # x:Double, y:Double, width:Double, height:Double - tag_kDOMRectReadOnlyTag = b"R" # x:Double, y:Double, width:Double, height:Double - tag_kDOMQuadTag = b"T" # p1:Double, p2:Double, p3:Double, p4:Double - tag_kDOMMatrixTag = b"Y" # m11..m44: 16 Double - tag_kDOMMatrixReadOnlyTag = b"U" # m11..m44: 16 Double - tag_kDOMMatrix2DTag = b"I" # a..f: 6 Double - tag_kDOMMatrix2DReadOnlyTag = b"O" # a..f: 6 Double - tag_kCryptoKeyTag = b"K" # subtag:byte, props, usages:uint32_t, - # keyDataLength:uint32_t, keyData:byte[keyDataLength] - # If subtag=AesKeyTag: - # props = keyLengthBytes:uint32_t, algorithmId:uint32_t - # If subtag=HmacKeyTag: - # props = keyLengthBytes:uint32_t, hashId:uint32_t - # If subtag=RsaHashedKeyTag: - # props = algorithmId:uint32_t, type:uint32_t, - # modulusLengthBits:uint32_t, - # publicExponentLength:uint32_t, - # publicExponent:byte[publicExponentLength], - # hashId:uint32_t - # If subtag=EcKeyTag: - # props = algorithmId:uint32_t, type:uint32_t, - # namedCurve:uint32_t - tag_kRTCCertificateTag = b"k" # length:uint32_t, pemPrivateKey:WebCoreString, - # pemCertificate:WebCoreString - tag_kRTCEncodedAudioFrameTag = b"A" # uint32_t -> transferred audio frame ID - tag_kRTCEncodedVideoFrameTag = b"V" # uint32_t -> transferred video frame ID - tag_kVideoFrameTag = b"v" # uint32_t -> transferred video frame ID - - # The following tags were used by the Shape Detection API implementation - # between M71 and M81. During these milestones, the API was always behind - # a flag. Usage was removed in https:#crrev.com/c/2040378. - tag_kDeprecatedDetectedBarcodeTag = b"B" - tag_kDeprecatedDetectedFaceTag = b"F" - tag_kDeprecatedDetectedTextTag = b"t" - - tag_kDOMExceptionTag = b"x" # name:String,message:String,stack:String - tag_kVersionTag = b"\xff" # version:uint32_t -> Uses this as the file version. 
- - -class BlinkV8Deserializer: - def _read_varint(self, stream) -> int: - return ccl_v8_value_deserializer.read_le_varint(stream)[0] - - def _read_file_index(self, stream: typing.BinaryIO) -> BlobIndex: - return BlobIndex(BlobIndexType.File, self._read_varint(stream)) - - def _read_file_list_index( - self, stream: typing.BinaryIO - ) -> typing.Iterable[BlobIndex]: - length = self._read_varint(stream) - result = [self._read_file_index(stream) for _ in range(length)] - return result - - def _not_implemented(self, stream): - raise NotImplementedError() - - def read(self, stream: typing.BinaryIO) -> typing.Any: - tag = stream.read(1) - - func = { - Constants.tag_kMessagePortTag: lambda x: self._not_implemented(x), - Constants.tag_kMojoHandleTag: lambda x: self._not_implemented(x), - Constants.tag_kBlobTag: lambda x: self._not_implemented(x), - Constants.tag_kBlobIndexTag: lambda x: self._not_implemented(x), - Constants.tag_kFileTag: lambda x: self._not_implemented(x), - Constants.tag_kFileIndexTag: lambda x: self._read_file_index(x), - Constants.tag_kDOMFileSystemTag: lambda x: self._not_implemented(x), - Constants.tag_kNativeFileSystemFileHandleTag: lambda x: self._not_implemented( - x - ), - Constants.tag_kNativeFileSystemDirectoryHandleTag: lambda x: self._not_implemented( - x - ), - Constants.tag_kFileListTag: lambda x: self._not_implemented(x), - Constants.tag_kFileListIndexTag: lambda x: self._read_file_list_index(x), - Constants.tag_kImageDataTag: lambda x: self._not_implemented(x), - Constants.tag_kImageBitmapTag: lambda x: self._not_implemented(x), - Constants.tag_kImageBitmapTransferTag: lambda x: self._not_implemented(x), - Constants.tag_kOffscreenCanvasTransferTag: lambda x: self._not_implemented( - x - ), - Constants.tag_kReadableStreamTransferTag: lambda x: self._not_implemented( - x - ), - Constants.tag_kTransformStreamTransferTag: lambda x: self._not_implemented( - x - ), - Constants.tag_kWritableStreamTransferTag: lambda x: self._not_implemented( - x - ), - Constants.tag_kDOMPointTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMPointReadOnlyTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMRectTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMRectReadOnlyTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMQuadTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMMatrixTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMMatrixReadOnlyTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMMatrix2DTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMMatrix2DReadOnlyTag: lambda x: self._not_implemented(x), - Constants.tag_kCryptoKeyTag: lambda x: self._not_implemented(x), - Constants.tag_kRTCCertificateTag: lambda x: self._not_implemented(x), - Constants.tag_kRTCEncodedAudioFrameTag: lambda x: self._not_implemented(x), - Constants.tag_kRTCEncodedVideoFrameTag: lambda x: self._not_implemented(x), - Constants.tag_kVideoFrameTag: lambda x: self._not_implemented(x), - Constants.tag_kDOMExceptionTag: lambda x: self._not_implemented(x), - }.get(tag) - - if func is None: - raise ValueError(f"Unknown tag: {tag}") - - return func(stream) diff --git a/utils/ccl_chrome_indexeddb/ccl_chromium_indexeddb.py b/utils/ccl_chrome_indexeddb/ccl_chromium_indexeddb.py deleted file mode 100644 index 346df63..0000000 --- a/utils/ccl_chrome_indexeddb/ccl_chromium_indexeddb.py +++ /dev/null @@ -1,843 +0,0 @@ -""" -Copyright 2020-2021, CCL Forensics - -Permission is hereby granted, free of charge, to any person obtaining a copy of 
-this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import dataclasses -import datetime -import enum -import io -import os -import pathlib -import struct -import sys -import types -import typing - -from . import ccl_blink_value_deserializer -from . import ccl_leveldb -from . import ccl_v8_value_deserializer - -__version__ = "0.6" -__description__ = "Module for reading Chromium IndexedDB LevelDB databases." -__contact__ = "Alex Caithness" - - -# TODO: need to go through and ensure that we have endianness right in all cases -# (it should sit behind a switch for integers, fixed for most other stuff) - - -def _read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False): - # this only outputs unsigned - i = 0 - result = 0 - underlying_bytes = [] - limit = 5 if is_google_32bit else 10 - while i < limit: - raw = stream.read(1) - if len(raw) < 1: - return None - (tmp,) = raw - underlying_bytes.append(tmp) - result |= (tmp & 0x7F) << (i * 7) - - if (tmp & 0x80) == 0: - break - i += 1 - return result, bytes(underlying_bytes) - - -def read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False): - x = _read_le_varint(stream, is_google_32bit=is_google_32bit) - if x is None: - return None - else: - return x[0] - - -def _le_varint_from_bytes(data: bytes): - with io.BytesIO(data) as buff: - return _read_le_varint(buff) - - -def custom_le_varint_from_bytes(data: bytes): - return _le_varint_from_bytes(data) - - -def le_varint_from_bytes(data: bytes): - with io.BytesIO(data) as buff: - return read_le_varint(buff) - - -class IdbKeyType(enum.IntEnum): - Null = 0 - String = 1 - Date = 2 - Number = 3 - Array = 4 - MinKey = 5 - Binary = 6 - - -class IdbKey: - # See: https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/indexed_db_leveldb_coding.cc - def __init__(self, buffer: bytes): - self.raw_key = buffer - self.key_type = IdbKeyType(buffer[0]) - raw_key = buffer[1:] - - if self.key_type == IdbKeyType.Null: - self.value = None - self._raw_length = 1 - elif self.key_type == IdbKeyType.String: - str_len, varint_raw = _le_varint_from_bytes(raw_key) - self.value = raw_key[ - len(varint_raw) : len(varint_raw) + str_len * 2 - ].decode("utf-16-be") - self._raw_length = 1 + len(varint_raw) + str_len * 2 - elif self.key_type == IdbKeyType.Date: - (ts,) = struct.unpack("" - - def __str__(self): - return self.__repr__() - - def __eq__(self, other): - if not isinstance(other, IdbKey): - raise NotImplementedError() - return self.raw_key == other.raw_key - - def __ne__(self, other): - return not self == other - - -class IndexedDBExternalObjectType(enum.IntEnum): - # see: 
https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/indexed_db_external_object.h - Blob = 0 - File = 1 - NativeFileSystemHandle = 2 - - -class IndexedDBExternalObject: - # see: https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/indexed_db_backing_store.cc - # for encoding. - - def __init__( - self, - object_type: IndexedDBExternalObjectType, - blob_number: typing.Optional[int], - mime_type: typing.Optional[str], - size: typing.Optional[int], - file_name: typing.Optional[str], - last_modified: typing.Optional[datetime.datetime], - native_file_token: typing.Optional, - ): - self.object_type = object_type - self.blob_number = blob_number - self.mime_type = mime_type - self.size = size - self.file_name = file_name - self.last_modified = last_modified - self.native_file_token = native_file_token - - @classmethod - def from_stream(cls, stream: typing.BinaryIO): - blob_type = IndexedDBExternalObjectType(stream.read(1)[0]) - if blob_type in ( - IndexedDBExternalObjectType.Blob, - IndexedDBExternalObjectType.File, - ): - blob_number = read_le_varint(stream) - mime_type_length = read_le_varint(stream) - mime_type = stream.read(mime_type_length * 2).decode("utf-16-be") - data_size = read_le_varint(stream) - - if blob_type == IndexedDBExternalObjectType.File: - file_name_length = read_le_varint(stream) - file_name = stream.read(file_name_length * 2).decode("utf-16-be") - x, x_raw = _read_le_varint(stream) - last_modified_td = datetime.timedelta(microseconds=x) - last_modified = datetime.datetime(1601, 1, 1) + last_modified_td - return cls( - blob_type, - blob_number, - mime_type, - data_size, - file_name, - last_modified, - None, - ) - else: - return cls( - blob_type, blob_number, mime_type, data_size, None, None, None - ) - else: - raise NotImplementedError() - - -@dataclasses.dataclass(frozen=True) -class DatabaseId: - dbid_no: int - origin: str - name: str - - -class GlobalMetadata: - def __init__(self, raw_meta_dict: dict): - # TODO: more of these meta types if required - self.backing_store_schema_version = None - if raw_schema_version := raw_meta_dict.get("\x00\x00\x00\x00\x00"): - self.backing_store_schema_version = le_varint_from_bytes(raw_schema_version) - - self.max_allocated_db_id = None - if raw_max_db_id := raw_meta_dict.get("\x00\x00\x00\x00\x01"): - self.max_allocated_db_id = le_varint_from_bytes(raw_max_db_id) - - database_ids_raw = ( - raw_meta_dict[x] - for x in raw_meta_dict - if x.startswith(b"\x00\x00\x00\x00\xc9") - ) - - dbids = [] - for dbid_rec in database_ids_raw: - with io.BytesIO(dbid_rec.key[5:]) as buff: - origin_length = read_le_varint(buff) - origin = buff.read(origin_length * 2).decode("utf-16-be") - db_name_length = read_le_varint(buff) - db_name = buff.read(db_name_length * 2).decode("utf-16-be") - - db_id_no = le_varint_from_bytes(dbid_rec.value) - - dbids.append(DatabaseId(db_id_no, origin, db_name)) - - self.db_ids = tuple(dbids) - - -class DatabaseMetadataType(enum.IntEnum): - OriginName = 0 # String - DatabaseName = 1 # String - IdbVersionString = 2 # String (and obsolete) - MaximumObjectStoreId = 3 # Int - IdbVersion = 4 # Varint - BlobNumberGeneratorCurrentNumber = 5 # Varint - - -class DatabaseMetadata: - def __init__(self, raw_meta: dict): - self._metas = types.MappingProxyType(raw_meta) - - def get_meta( - self, db_id: int, meta_type: DatabaseMetadataType - ) -> typing.Optional[typing.Union[str, int]]: - record = self._metas.get((db_id, meta_type)) - if not record: - return None - - if meta_type == 
DatabaseMetadataType.MaximumObjectStoreId: - return le_varint_from_bytes(record.value) - - # TODO - raise NotImplementedError() - - -class ObjectStoreMetadataType(enum.IntEnum): - StoreName = 0 # String - KeyPath = 1 # IDBKeyPath - AutoIncrementFlag = 2 # Bool - IsEvictable = 3 # Bool (and obsolete apparently) - LastVersionNumber = 4 # Int - MaximumAllocatedIndexId = 5 # Int - HasKeyPathFlag = 6 # Bool (and obsolete apparently) - KeygeneratorCurrentNumber = 7 # Int - - -class ObjectStoreMetadata: - # All metadata fields are prefaced by a 0x00 byte - def __init__(self, raw_meta: dict): - self._metas = types.MappingProxyType(raw_meta) - - def get_meta( - self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType - ): - record = self._metas.get((db_id, obj_store_id, meta_type)) - if not record: - return None - - if meta_type == ObjectStoreMetadataType.StoreName: - return record.value.decode("utf-16-be") - - # TODO - raise NotImplementedError() - - -class IndexedDbRecord: - def __init__( - self, - owner: "IndexedDb", - db_id: int, - obj_store_id: int, - key: IdbKey, - value: typing.Any, - is_live: bool, - ldb_seq_no: int, - origin_file: os.PathLike, - ): - self.owner = owner - self.db_id = db_id - self.obj_store_id = obj_store_id - self.key = key - self.value = value - self.is_live = is_live - self.sequence_number = ldb_seq_no - self.origin_file = origin_file - - def resolve_blob_index( - self, blob_index: ccl_blink_value_deserializer.BlobIndex - ) -> IndexedDBExternalObject: - """Resolve a ccl_blink_value_deserializer.BlobIndex to its IndexedDBExternalObject - to get metadata (file name, timestamps, etc)""" - return self.owner.get_blob_info( - self.db_id, self.obj_store_id, self.key.raw_key, blob_index.index_id - ) - - def get_blob_stream( - self, blob_index: ccl_blink_value_deserializer.BlobIndex - ) -> typing.BinaryIO: - """Resolve a ccl_blink_value_deserializer.BlobIndex to a stream of its content""" - return self.owner.get_blob( - self.db_id, self.obj_store_id, self.key.raw_key, blob_index.index_id - ) - - -class IndexedDb: - # This will be informative for a lot of the data below: - # https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/docs/leveldb_coding_scheme.md - - # Of note, the first byte of the key defines the length of the db_id, obj_store_id and index_id in bytes: - # 0b xxxyyyzz (x = db_id size - 1, y = obj_store size - 1, z = index_id - 1) - # Currently I just assume that everything falls between 1 and 127 for simplicity as it makes scanning the keys - # lots easier. 
- def __init__(self, leveldb_dir: os.PathLike, leveldb_blob_dir: os.PathLike = None): - self._db = ccl_leveldb.RawLevelDb(leveldb_dir) - self._blob_dir = leveldb_blob_dir - self.global_metadata = GlobalMetadata(self._get_raw_global_metadata()) - self.database_metadata = DatabaseMetadata(self._get_raw_database_metadata()) - self.object_store_meta = ObjectStoreMetadata( - self._get_raw_object_store_metadata() - ) - - self._blob_lookup_cache = {} - - @staticmethod - def make_prefix(db_id: int, obj_store_id: int, index_id: int) -> bytes: - def count_bytes(val): - i = 0 - while val > 0: - i += 1 - val = val >> 8 - return i - - def yield_le_bytes(val): - if val < 0: - raise ValueError - while val > 0: - yield val & 0xFF - val >> 8 - - db_id_size = count_bytes(db_id) - obj_store_id_size = count_bytes(obj_store_id) - index_id_size = count_bytes(index_id) - - if db_id_size > 8 or obj_store_id_size > 8 or index_id_size > 4: - raise ValueError("id sizes are too big") - - byte_one = ( - ((db_id_size - 1) << 5) | ((obj_store_id_size - 1) << 2) | index_id_size - ) - return bytes( - [ - byte_one, - *yield_le_bytes(db_id), - *yield_le_bytes(obj_store_id), - *yield_le_bytes(index_id), - ] - ) - - def get_database_metadata(self, db_id: int, meta_type: DatabaseMetadataType): - return self.database_metadata.get_meta(db_id, meta_type) - - def get_object_store_metadata( - self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadataType - ): - return self.object_store_meta.get_meta(db_id, obj_store_id, meta_type) - - def _get_raw_global_metadata( - self, live_only=True - ) -> typing.Dict[bytes, ccl_leveldb.Record]: - # Global metadata always has the prefix 0 0 0 0 - if not live_only: - raise NotImplementedError("Deleted metadata not implemented yet") - meta = {} - for record in self._db.iterate_records_raw(reverse=True): - if ( - record.key.startswith(b"\x00\x00\x00\x00") - and record.state == ccl_leveldb.KeyState.Live - ): - # we only want live keys and the newest version thereof (highest seq) - if record.key not in meta or meta[record.key].seq < record.seq: - meta[record.key] = record - - return meta - - def _get_raw_database_metadata(self, live_only=True): - if not live_only: - raise NotImplementedError("Deleted metadata not implemented yet") - - db_meta = {} - - for db_id in self.global_metadata.db_ids: - if db_id.dbid_no > 0x7F: - raise NotImplementedError( - "there could be this many dbs, but I don't support it yet" - ) - - prefix = bytes([0, db_id.dbid_no, 0, 0]) - for record in self._db.iterate_records_raw(reverse=True): - if ( - record.key.startswith(prefix) - and record.state == ccl_leveldb.KeyState.Live - ): - # we only want live keys and the newest version thereof (highest seq) - meta_type = record.key[len(prefix)] - old_version = db_meta.get((db_id.dbid_no, meta_type)) - if old_version is None or old_version.seq < record.seq: - db_meta[(db_id.dbid_no, meta_type)] = record - - return db_meta - - def _get_raw_object_store_metadata(self, live_only=True): - if not live_only: - raise NotImplementedError("Deleted metadata not implemented yet") - - os_meta = {} - - for db_id in self.global_metadata.db_ids: - if db_id.dbid_no > 0x7F: - raise NotImplementedError( - "there could be this many dbs, but I don't support it yet" - ) - - prefix = bytes([0, db_id.dbid_no, 0, 0, 50]) - - for record in self._db.iterate_records_raw(reverse=True): - if ( - record.key.startswith(prefix) - and record.state == ccl_leveldb.KeyState.Live - ): - # we only want live keys and the newest version thereof (highest seq) - 
objstore_id, varint_raw = _le_varint_from_bytes( - record.key[len(prefix) :] - ) - meta_type = record.key[len(prefix) + len(varint_raw)] - - old_version = os_meta.get((db_id.dbid_no, objstore_id, meta_type)) - - if old_version is None or old_version.seq < record.seq: - os_meta[(db_id.dbid_no, objstore_id, meta_type)] = record - - return os_meta - - def iterate_records( - self, - db_id: int, - store_id: int, - *, - live_only=False, - bad_deserializer_data_handler: typing.Callable[ - [IdbKey, bytes], typing.Any - ] = None, - ): - if db_id > 0x7F or store_id > 0x7F: - raise NotImplementedError( - "there could be this many dbs or object stores, but I don't support it yet" - ) - - blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer() - - # goodness me this is a slow way of doing things - prefix = bytes([0, db_id, store_id, 1]) - for record in self._db.iterate_records_raw(): - if record.key.startswith(prefix): - key = IdbKey(record.key[len(prefix) :]) - if not record.value: - # empty values will obviously fail, returning None is probably better than dying. - return key, None - value_version, varint_raw = _le_varint_from_bytes(record.value) - val_idx = len(varint_raw) - # read the blink envelope - blink_type_tag = record.value[val_idx] - if blink_type_tag != 0xFF: - # TODO: probably don't want to fail hard here long term... - if bad_deserializer_data_handler is not None: - bad_deserializer_data_handler(key, record.value) - continue - else: - raise ValueError("Blink type tag not present") - val_idx += 1 - - blink_version, varint_raw = _le_varint_from_bytes( - record.value[val_idx:] - ) - - val_idx += len(varint_raw) - - obj_raw = io.BytesIO(record.value[val_idx:]) - deserializer = ccl_v8_value_deserializer.Deserializer( - obj_raw, host_object_delegate=blink_deserializer.read - ) - try: - value = deserializer.read() - except Exception: - if bad_deserializer_data_handler is not None: - bad_deserializer_data_handler(key, record.value) - continue - raise - yield IndexedDbRecord( - self, - db_id, - store_id, - key, - value, - record.state == ccl_leveldb.KeyState.Live, - record.seq, - record.origin_file, - ) - - def get_blob_info( - self, db_id: int, store_id: int, raw_key: bytes, file_index: int - ) -> IndexedDBExternalObject: - if db_id > 0x7F or store_id > 0x7F: - raise NotImplementedError( - "there could be this many dbs, but I don't support it yet" - ) - - if result := self._blob_lookup_cache.get( - (db_id, store_id, raw_key, file_index) - ): - return result - - # goodness me this is a slow way of doing things, - # TODO: we should at least cache along the way to our record - prefix = bytes([0, db_id, store_id, 3]) - for record in self._db.iterate_records_raw(): - if record.key.startswith(prefix): - buff = io.BytesIO(record.value) - idx = 0 - while buff.tell() < len(record.value): - blob_info = IndexedDBExternalObject.from_stream(buff) - self._blob_lookup_cache[(db_id, store_id, raw_key, idx)] = blob_info - idx += 1 - break - - if result := self._blob_lookup_cache.get( - (db_id, store_id, raw_key, file_index) - ): - return result - else: - raise KeyError((db_id, store_id, raw_key, file_index)) - - def get_blob( - self, db_id: int, store_id: int, raw_key: bytes, file_index: int - ) -> typing.BinaryIO: - # Some detail here: https://github.com/chromium/chromium/blob/master/content/browser/indexed_db/docs/README.md - if self._blob_dir is None: - raise ValueError("Can't resolve blob if blob dir is not set") - info = self.get_blob_info(db_id, store_id, raw_key, file_index) - - # path 
will be: origin.blob/database id/top 16 bits of blob number with two digits/blob number - # TODO: check if this is still the case on non-windows systems - path = pathlib.Path( - self._blob_dir, - str(db_id), - f"{info.blob_number >> 8:02x}", - f"{info.blob_number:x}", - ) - - if path.exists(): - return path.open("rb") - - raise FileNotFoundError(path) - - @property - def database_path(self): - return self._db.in_dir_path - - -class WrappedObjectStore: - def __init__(self, raw_db: IndexedDb, dbid_no: int, obj_store_id: int): - self._raw_db = raw_db - self._dbid_no = dbid_no - self._obj_store_id = obj_store_id - - @property - def object_store_id(self): - return self._obj_store_id - - @property - def name(self) -> str: - return self._raw_db.get_object_store_metadata( - self._dbid_no, self._obj_store_id, ObjectStoreMetadataType.StoreName - ) - - @staticmethod - def _log_error(key: IdbKey, data: bytes): - sys.stderr.write(f"ERROR decoding key: {key}\n") - - def get_blob(self, raw_key: bytes, file_index: int) -> typing.BinaryIO: - return self._raw_db.get_blob( - self._dbid_no, self.object_store_id, raw_key, file_index - ) - - # def __iter__(self): - # yield from self._raw_db.iterate_records(self._dbid_no, self._obj_store_id) - - def iterate_records( - self, - *, - live_only=False, - errors_to_stdout=False, - bad_deserializer_data_handler: typing.Callable[ - [IdbKey, bytes], typing.Any - ] = None, - ): - def _handler(key, record): - if bad_deserializer_data_handler is not None: - bad_deserializer_data_handler(key, record) - if errors_to_stdout: - WrappedObjectStore._log_error(key, record) - - handler = ( - _handler - if errors_to_stdout or bad_deserializer_data_handler is not None - else None - ) - - yield from self._raw_db.iterate_records( - self._dbid_no, - self._obj_store_id, - live_only=live_only, - bad_deserializer_data_handler=handler, - ) - - def __repr__(self): - return f"" - - -class WrappedDatabase: - def __init__(self, raw_db: IndexedDb, dbid: DatabaseId): - self._raw_db = raw_db - self._dbid = dbid - - names = [] - for obj_store_id in range(1, self.object_store_count + 1): - names.append( - self._raw_db.get_object_store_metadata( - self.db_number, obj_store_id, ObjectStoreMetadataType.StoreName - ) - ) - self._obj_store_names = tuple(names) - # pre-compile object store wrappers as there's little overhead - self._obj_stores = tuple( - WrappedObjectStore(self._raw_db, self.db_number, i) - for i in range(1, self.object_store_count + 1) - ) - - @property - def name(self) -> str: - return self._dbid.name - - @property - def origin(self) -> str: - return self._dbid.origin - - @property - def db_number(self) -> int: - return self._dbid.dbid_no - - @property - def object_store_count(self) -> int: - # NB obj store ids are enumerated from 1. - return ( - self._raw_db.get_database_metadata( - self.db_number, DatabaseMetadataType.MaximumObjectStoreId - ) - or 0 - ) # returns None if there are none. 
- - @property - def object_store_names(self) -> typing.Iterable[str]: - yield from self._obj_store_names - - def get_object_store_by_id(self, obj_store_id: int) -> WrappedObjectStore: - if obj_store_id > 0 and obj_store_id <= self.object_store_count: - return self._obj_stores[obj_store_id - 1] - raise ValueError( - "obj_store_id must be greater than zero and less or equal to object_store_count " - "NB object stores are enumerated from 1 - there is no store with id 0" - ) - - def get_object_store_by_name(self, name: str) -> WrappedObjectStore: - if name in self: - return self.get_object_store_by_id(self._obj_store_names.index(name) + 1) - raise KeyError(f"{name} is not an object store in this database") - - def __len__(self): - len(self._obj_stores) - - def __contains__(self, item): - return item in self._obj_store_names - - def __getitem__(self, item) -> WrappedObjectStore: - if isinstance(item, int): - return self.get_object_store_by_id(item) - elif isinstance(item, str): - return self.get_object_store_by_name(item) - raise TypeError("Key can only be str (name) or int (id number)") - - def __repr__(self): - return f"" - - -class WrappedIndexDB: - def __init__(self, leveldb_dir: os.PathLike, leveldb_blob_dir: os.PathLike = None): - self._raw_db = IndexedDb(leveldb_dir, leveldb_blob_dir) - self._multiple_origins = ( - len(set(x.origin for x in self._raw_db.global_metadata.db_ids)) > 1 - ) - - self._db_number_lookup = { - x.dbid_no: WrappedDatabase(self._raw_db, x) - for x in self._raw_db.global_metadata.db_ids - } - # set origin to 0 if there's only 1 and we'll ignore it in all lookups - self._db_name_lookup = { - (x.name, x.origin if self.has_multiple_origins else 0): x - for x in self._db_number_lookup.values() - } - - @property - def database_count(self): - return len(self._db_number_lookup) - - @property - def database_ids(self): - yield from self._raw_db.global_metadata.db_ids - - @property - def has_multiple_origins(self): - return self._multiple_origins - - def __len__(self): - len(self._db_number_lookup) - - def __contains__(self, item): - if isinstance(item, str): - if self.has_multiple_origins: - raise ValueError( - "Database contains multiple origins, lookups must be provided as a tuple of (name, origin)" - ) - return (item, 0) in self._db_name_lookup - elif isinstance(item, tuple) and len(item) == 2: - name, origin = item - if not self.has_multiple_origins: - origin = 0 # origin ignored if not needed - return (name, origin) in self._db_name_lookup - elif isinstance(item, int): - return item in self._db_number_lookup - else: - raise TypeError( - "keys must be provided as a tuple of (name, origin) or a str (if only single origin) or int" - ) - - def __getitem__( - self, item: typing.Union[int, str, typing.Tuple[str, str]] - ) -> WrappedDatabase: - if isinstance(item, int): - if item in self._db_number_lookup: - return self._db_number_lookup[item] - else: - raise KeyError(item) - elif isinstance(item, str): - if self.has_multiple_origins: - raise ValueError( - "Database contains multiple origins, indexes must be provided as a tuple of (name, origin)" - ) - if item in self: - return self._db_name_lookup[item, 0] - else: - raise KeyError(item) - elif isinstance(item, tuple) and len(item) == 2: - name, origin = item - if not self.has_multiple_origins: - origin = 0 # origin ignored if not needed - if (name, origin) in self: - return self._db_name_lookup[name, origin] - else: - raise KeyError(item) - - raise TypeError("Lookups must be one of int, str or tuple of name and origin") - - 
def __repr__(self): - return f"" diff --git a/utils/ccl_chrome_indexeddb/ccl_chromium_localstorage.py b/utils/ccl_chrome_indexeddb/ccl_chromium_localstorage.py deleted file mode 100644 index ed60459..0000000 --- a/utils/ccl_chrome_indexeddb/ccl_chromium_localstorage.py +++ /dev/null @@ -1,359 +0,0 @@ -""" -Copyright 2021, CCL Forensics -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import io -import bisect -import sys -import pathlib -import types -import typing -import dataclasses -import datetime - -from . import ccl_leveldb - -__version__ = "0.1" -__description__ = "Module for reading the Chromium leveldb localstorage format" -__contact__ = "Alex Caithness" - -""" -See: https://source.chromium.org/chromium/chromium/src/+/main:components/services/storage/dom_storage/local_storage_impl.cc -Meta keys: - Key = "META:" + storage_key (the host) - Value = protobuff: 1=timestamp (varint); 2=size in bytes (varint) - -Record keys: - Key = "_" + storage_key + "\\x0" + script_key - Value = record_value - -""" - -_META_PREFIX = b"META:" -_RECORD_KEY_PREFIX = b"_" -_CHROME_EPOCH = datetime.datetime(1601, 1, 1, 0, 0, 0) - -EIGHT_BIT_ENCODING = "iso-8859-1" - - -def from_chrome_timestamp(microseconds: int) -> datetime.datetime: - return _CHROME_EPOCH + datetime.timedelta(microseconds=microseconds) - - -def decode_string(raw: bytes) -> str: - """ - decodes a type-prefixed string - prefix of: 0=utf-16-le; 1=an extended ascii codepage (likely dependant on locale) - :param raw: raw prefixed-string data - :return: decoded string - """ - prefix = raw[0] - if prefix == 0: - return raw[1:].decode("utf-16-le") - elif prefix == 1: - return raw[1:].decode(EIGHT_BIT_ENCODING) - else: - raise ValueError("Unexpected prefix, please contact developer") - - -@dataclasses.dataclass(frozen=True) -class StorageMetadata: - storage_key: str - timestamp: datetime.datetime - size_in_bytes: int - leveldb_seq_number: int - - @classmethod - def from_protobuff(cls, storage_key: str, data: bytes, seq: int): - with io.BytesIO(data) as stream: - # This is a simple protobuff, so we'll read it directly, but with checks, rather than add a dependency - ts_tag = ccl_leveldb.read_le_varint(stream) - if (ts_tag & 0x07) != 0 or (ts_tag >> 3) != 1: - raise ValueError( - "Unexpected tag when reading StorageMetadata from protobuff" - ) - timestamp = from_chrome_timestamp(ccl_leveldb.read_le_varint(stream)) - - size_tag = ccl_leveldb.read_le_varint(stream) - if (size_tag & 0x07) != 0 or (size_tag >> 3) != 2: - raise ValueError( - "Unexpected tag when reading 
StorageMetadata from protobuff" - ) - size = ccl_leveldb.read_le_varint(stream) - - return cls(storage_key, timestamp, size, seq) - - -@dataclasses.dataclass(frozen=True) -class LocalStorageRecord: - storage_key: str - script_key: str - value: str - leveldb_seq_number: int - is_live: bool - - -class LocalStorageBatch: - def __init__(self, meta: StorageMetadata, end_seq: int): - self._meta = meta - self._end = end_seq - - @property - def storage_key(self) -> str: - return self._meta.storage_key - - @property - def timestamp(self) -> datetime.datetime: - return self._meta.timestamp - - @property - def start(self): - return self._meta.leveldb_seq_number - - @property - def end(self): - return self._end - - def __repr__(self): - return f"(storage_key={self.storage_key}, timestamp={self.timestamp}, start={self.start}, end={self.end})" - - -class LocalStoreDb: - def __init__(self, in_dir: pathlib.Path): - if not in_dir.is_dir(): - raise IOError("Input directory is not a directory") - - self._ldb = ccl_leveldb.RawLevelDb(in_dir) - - self._storage_details = {} # storage_key: {seq_number: StorageMetadata} - self._flat_items = [] # [StorageMetadata|LocalStorageRecord] - used to batch items up - self._records = {} # storage_key: {script_key: {seq_number: LocalStorageRecord}} - - for record in self._ldb.iterate_records_raw(): - if ( - record.user_key.startswith(_META_PREFIX) - and record.state == ccl_leveldb.KeyState.Live - ): - # Only live records for metadata - not sure what we can reliably infer from deleted keys - storage_key = record.user_key.removeprefix(_META_PREFIX).decode( - EIGHT_BIT_ENCODING - ) - self._storage_details.setdefault(storage_key, {}) - metadata = StorageMetadata.from_protobuff( - storage_key, record.value, record.seq - ) - self._storage_details[storage_key][record.seq] = metadata - self._flat_items.append(metadata) - elif record.user_key.startswith(_RECORD_KEY_PREFIX): - # We include deleted records here because we need them to build batches - storage_key_raw, script_key_raw = record.user_key.removeprefix( - _RECORD_KEY_PREFIX - ).split(b"\x00", 1) - storage_key = storage_key_raw.decode(EIGHT_BIT_ENCODING) - script_key = decode_string(script_key_raw) - - try: - value = ( - decode_string(record.value) - if record.state == ccl_leveldb.KeyState.Live - else None - ) - except UnicodeDecodeError as e: - # Some sites play games to test the browser's capabilities like encoding half of a surrogate pair - print( - f"Error decoding record value at seq no {record.seq}; " - f"{storage_key} {script_key}: {record.value}" - ) - continue - - self._records.setdefault(storage_key, {}) - self._records[storage_key].setdefault(script_key, {}) - - ls_record = LocalStorageRecord( - storage_key, - script_key, - value, - record.seq, - record.state == ccl_leveldb.KeyState.Live, - ) - self._records[storage_key][script_key][record.seq] = ls_record - self._flat_items.append(ls_record) - - self._storage_details = types.MappingProxyType(self._storage_details) - self._records = types.MappingProxyType(self._records) - - self._all_storage_keys = frozenset( - self._storage_details.keys() | self._records.keys() - ) # because deleted data. - self._flat_items.sort(key=lambda x: x.leveldb_seq_number) - - # organise batches - this is made complex and slow by having to account for missing/deleted data - # we're looking for a StorageMetadata followed by sequential (in terms of seq number) LocalStorageRecords - # with the same storage key. Everything that falls within that chain can safely be considered a batch. 
- # Any break in sequence numbers or storage key is a fail and can't be considered part of a batch. - self._batches = {} - current_meta: typing.Optional[StorageMetadata] = None - current_end = 0 - for item in self._flat_items: # pre-sorted - if isinstance(item, LocalStorageRecord): - if current_meta is None: - # no currently valid metadata so we can't attribute this record to anything - continue - elif ( - item.leveldb_seq_number - current_end != 1 - or item.storage_key != current_meta.storage_key - ): - # this record breaks a chain, so bundle up what we have and clear everything out - self._batches[current_meta.leveldb_seq_number] = LocalStorageBatch( - current_meta, current_end - ) - current_meta = None - current_end = 0 - else: - # contiguous and right storage key, include in the current chain - current_end = item.leveldb_seq_number - elif isinstance(item, StorageMetadata): - if current_meta is not None: - # this record breaks a chain, so bundle up what we have, set new start - self._batches[current_meta.leveldb_seq_number] = LocalStorageBatch( - current_meta, current_end - ) - current_meta = item - current_end = item.leveldb_seq_number - else: - raise ValueError - - if current_meta is not None: - self._batches[current_meta.leveldb_seq_number] = LocalStorageBatch( - current_meta, current_end - ) - - self._batch_starts = tuple(sorted(self._batches.keys())) - - def iter_storage_keys(self) -> typing.Iterable[str]: - yield from self._storage_details.keys() - - def contains_storage_key(self, storage_key: str) -> bool: - return storage_key in self._all_storage_keys - - def iter_script_keys(self, storage_key: str) -> typing.Iterable[str]: - if storage_key not in self._all_storage_keys: - raise KeyError(storage_key) - if storage_key not in self._records: - raise StopIteration - yield from self._records[storage_key].keys() - - def contains_script_key(self, storage_key: str, script_key: str) -> bool: - return script_key in self._records.get(storage_key, {}) - - def find_batch(self, seq: int): - """ - Finds the batch that a record with the given sequence number belongs to - :param seq: leveldb sequence id - :return: the batch containing the given sequence number or None if no batch contains it - """ - - i = bisect.bisect_left(self._batch_starts, seq) - 1 - if i < 0: - return None - start = self._batch_starts[i] - batch = self._batches[start] - if batch.start <= seq <= batch.end: - return batch - else: - return None - - def iter_all_records(self) -> typing.Iterable[LocalStorageRecord]: - """ - :return: iterable of LocalStorageRecords - """ - for storage_key, script_dict in self._records.items(): - for script_key, values in script_dict.items(): - for seq, value in values.items(): - if value.is_live: - yield value - - def iter_records_for_storage_key( - self, storage_key - ) -> typing.Iterable[LocalStorageRecord]: - """ - :param storage_key: storage key (host) for the records - :return: iterable of LocalStorageRecords - """ - if not self.contains_storage_key(storage_key): - raise KeyError(storage_key) - for script_key, values in self._records[storage_key].items(): - for seq, value in values.items(): - if value.is_live: - yield value - - def iter_records_for_script_key( - self, storage_key, script_key - ) -> typing.Iterable[LocalStorageRecord]: - """ - :param storage_key: storage key (host) for the records - :param script_key: script defined key for the records - :return: iterable of LocalStorageRecords - """ - if not self.contains_script_key(storage_key, script_key): - raise KeyError((storage_key, 
script_key)) - for seq, value in self._records[storage_key][script_key].items(): - if value.is_live: - yield value - - def iter_metadata(self) -> typing.Iterable[StorageMetadata]: - """ - :return: iterable of StorageMetaData - """ - for meta in self._flat_items: - if isinstance(meta, StorageMetadata): - yield meta - - def iter_metadata_for_storage_key( - self, storage_key: str - ) -> typing.Iterable[StorageMetadata]: - """ - :param storage_key: storage key (host) for the metadata - :return: iterable of StorageMetadata - """ - if storage_key not in self._all_storage_keys: - raise KeyError(storage_key) - if storage_key not in self._storage_details: - return None - for seq, meta in self._storage_details[storage_key].items(): - yield meta - - def iter_batches(self) -> typing.Iterable[LocalStorageBatch]: - yield from self._batches.values() - - def close(self): - self._ldb.close() - - -def main(args): - in_ldb_path = pathlib.Path(args[0]) - local_store = LocalStoreDb(in_ldb_path) - - for rec in local_store.iter_all_records(): - batch = local_store.find_batch(rec.leveldb_seq_number) - print(rec, batch) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/utils/ccl_chrome_indexeddb/ccl_chromium_sessionstorage.py b/utils/ccl_chrome_indexeddb/ccl_chromium_sessionstorage.py deleted file mode 100644 index 267b460..0000000 --- a/utils/ccl_chrome_indexeddb/ccl_chromium_sessionstorage.py +++ /dev/null @@ -1,225 +0,0 @@ -""" -Copyright 2021, CCL Forensics -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import sys -import pathlib -import typing -import dataclasses -from types import MappingProxyType - -from . import ccl_leveldb - -__version__ = "0.1" -__description__ = "Module for reading the Chromium leveldb sessionstorage format" -__contact__ = "Alex Caithness" - -# See: https://source.chromium.org/chromium/chromium/src/+/main:components/services/storage/dom_storage/session_storage_metadata.cc -# et al - -_NAMESPACE_PREFIX = b"namespace-" -_MAP_ID_PREFIX = b"map-" - -log = None - - -@dataclasses.dataclass(frozen=True) -class SessionStoreValue: - value: str - guid: typing.Optional[str] - leveldb_sequence_number: int - - -class SessionStoreDb: - # todo: get all grouped by namespace by host? - # todo: get all grouped by namespace by host.key? - # todo: consider refactoring to only getting metadata on first pass and everything else on demand? 
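The namespace bookkeeping that SessionStoreDb builds in its first pass hinges on two key prefixes: "namespace-<guid>-<host>" records whose value is a map id, and "map-<map_id>-<script_key>" records that hold the stored strings. A minimal sketch of that split, using made-up keys rather than data from this repository (the split in the code below only works cleanly when the GUID contains no hyphens, which the hypothetical key assumes):

# Illustrative only: hypothetical session-storage LevelDB keys, split the same
# way SessionStoreDb does with str.split("-", 2).
namespace_key = b"namespace-5a0ad67952e14c36b0b34a27ba8c5ab9-https://example.com/"
map_key = b"map-42-session_marker"

_, guid, host = namespace_key.decode("utf-8").split("-", 2)
_, map_id, ss_key = map_key.decode("utf-8").split("-", 2)

print(guid, host.lower())   # tab GUID and normalised host
print(map_id, ss_key)       # map id and the script-defined key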
- def __init__(self, in_dir: pathlib.Path): - if not in_dir.is_dir(): - raise IOError("Input directory is not a directory") - - self._ldb = ccl_leveldb.RawLevelDb(in_dir) - - # If performance is a concern we should refactor this, but slow and steady for now - - # First collect the namespace (session/tab guid + host) and map-ids together - self._map_id_to_host = {} # map_id: (guid, host) - self._deleted_keys = set() - - for rec in self._ldb.iterate_records_raw(): - if rec.user_key.startswith(_NAMESPACE_PREFIX): - if rec.user_key == _NAMESPACE_PREFIX: - continue # bogus entry near the top usually - try: - key = rec.user_key.decode("utf-8") - except UnicodeDecodeError: - print(f"Invalid namespace key: {rec.user_key}") - continue - - split_key = key.split("-", 2) - if len(split_key) != 3: - print(f"Invalid namespace key: {key}") - continue - - _, guid, host = split_key - - if not host: - continue # TODO investigate why this happens - - # normalize host to lower just in case - host = host.lower() - guid_host_pair = guid, host - - if rec.state == ccl_leveldb.KeyState.Deleted: - self._deleted_keys.add(guid_host_pair) - else: - try: - map_id = rec.value.decode("utf-8") - except UnicodeDecodeError: - print(f"Invalid namespace value: {key}") - continue - - if not map_id: - continue # TODO: investigate why this happens/do we want to keep the host around somewhere? - - # if map_id in self._map_id_to_host_guid and self._map_id_to_host_guid[map_id] != guid_host_pair: - if ( - map_id in self._map_id_to_host - and self._map_id_to_host[map_id] != host - ): - print("Map ID Collision!") - print(f"map_id: {map_id}") - print(f"Old host: {self._map_id_to_host[map_id]}") - print(f"New host: {guid_host_pair}") - raise ValueError("map_id collision") - else: - self._map_id_to_host[map_id] = host - - # freeze stuff - self._map_id_to_host = MappingProxyType(self._map_id_to_host) - self._deleted_keys = frozenset(self._deleted_keys) - - self._host_lookup = {} # {host: {ss_key: [SessionStoreValue, ...]}} - self._orphans = [] # list of tuples of key, value where we can't get the host - for rec in self._ldb.iterate_records_raw(): - if rec.user_key.startswith(_MAP_ID_PREFIX): - try: - key = rec.user_key.decode("utf-8") - except UnicodeDecodeError: - print(f"Invalid map id key: {rec.user_key}") - continue - - if rec.state == ccl_leveldb.KeyState.Deleted: - continue # TODO: do we want to keep the key around because the presence is important? - - split_key = key.split("-", 2) - if len(split_key) != 3: - print(f"Invalid map id key: {key}") - continue - - _, map_id, ss_key = split_key - - if not split_key: - # TODO what does it mean when there is no key here? 
- # The value will also be a single number (encoded utf-8) - continue - - try: - value = rec.value.decode("UTF-16-LE") - except UnicodeDecodeError: - print(f"Error decoding value for {key}") - print(f"Raw Value: {rec.value}") - continue - - # guid_host_pair = self._map_id_to_host_guid.get(map_id) - host = self._map_id_to_host.get(map_id) - # if not guid_host_pair: - if not host: - self._orphans.append( - (ss_key, SessionStoreValue(value, None, rec.seq)) - ) - else: - # guid, host = guid_host_pair - self._host_lookup.setdefault(host, {}) - self._host_lookup[host].setdefault(ss_key, []) - self._host_lookup[host][ss_key].append( - SessionStoreValue(value, None, rec.seq) - ) - - def __contains__(self, item: typing.Union[str, typing.Tuple[str, str]]) -> bool: - """if item is a str, returns true if that host is present - if item is a tuple of (str, str), returns True if that host and key pair are present""" - if isinstance(item, str): - return item in self._host_lookup - elif isinstance(item, tuple) and len(item) == 2: - host, key = item - return host in self._host_lookup and key in self._host_lookup[host] - else: - raise TypeError("item must be a string or a tuple of (str, str)") - - def iter_hosts(self) -> typing.Iterable[str]: - yield from self._host_lookup.keys() - - def get_all_for_host(self, host): - if host not in self: - return {} - result_raw = dict(self._host_lookup[host]) - for ss_key in result_raw: - result_raw[ss_key] = tuple(result_raw[ss_key]) - return result_raw - - def get_session_storage_key(self, host, key): - if (host, key) not in self: - return tuple() - return tuple(self._host_lookup[host][key]) - - def iter_orphans(self): - yield from self._orphans - - def __getitem__(self, item: typing.Union[str, typing.Tuple[str, str]]): - if item not in self: - raise KeyError(item) - - if isinstance(item, str): - return self.get_all_for_host(item) - elif isinstance(item, tuple) and len(item) == 2: - return self.get_session_storage_key(*item) - else: - raise TypeError("item must be a string or a tuple of (str, str)") - - def __iter__(self): - """iterates the hosts present""" - return self.iter_hosts() - - def close(self): - self._ldb.close() - - -def main(args): - ldb_in_dir = pathlib.Path(args[0]) - ssdb = SessionStoreDb(ldb_in_dir) - - print("Hosts in db:") - for host in ssdb: - print(host) - - -if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/utils/ccl_chrome_indexeddb/ccl_leveldb.py b/utils/ccl_chrome_indexeddb/ccl_leveldb.py deleted file mode 100644 index e99bd5c..0000000 --- a/utils/ccl_chrome_indexeddb/ccl_leveldb.py +++ /dev/null @@ -1,672 +0,0 @@ -""" -Copyright 2020-2021, CCL Forensics - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import typing -import struct -import re -import os -import io -import pathlib -import dataclasses -import enum -from collections import namedtuple -from types import MappingProxyType -from . import ccl_simplesnappy - -__version__ = "0.4" -__description__ = "A module for reading LevelDB databases" -__contact__ = "Alex Caithness" - - -def _read_le_varint( - stream: typing.BinaryIO, *, is_google_32bit=False -) -> typing.Optional[typing.Tuple[int, bytes]]: - """Read varint from a stream. - If the read is successful: returns a tuple of the (unsigned) value and the raw bytes making up that varint, - otherwise returns None. - Can be switched to limit the varint to 32 bit.""" - # this only outputs unsigned - i = 0 - result = 0 - underlying_bytes = [] - limit = 5 if is_google_32bit else 10 - while i < limit: - raw = stream.read(1) - if len(raw) < 1: - return None - (tmp,) = raw - underlying_bytes.append(tmp) - result |= (tmp & 0x7F) << (i * 7) - if (tmp & 0x80) == 0: - break - i += 1 - return result, bytes(underlying_bytes) - - -def read_le_varint( - stream: typing.BinaryIO, *, is_google_32bit=False -) -> typing.Optional[int]: - """Convenience version of _read_le_varint that only returns the value or None""" - x = _read_le_varint(stream, is_google_32bit=is_google_32bit) - if x is None: - return None - else: - return x[0] - - -def read_length_prefixed_blob(stream: typing.BinaryIO): - length = read_le_varint(stream) - data = stream.read(length) - if len(data) != length: - raise ValueError(f"Could not read all data (expected {length}, got {len(data)}") - return data - - -@dataclasses.dataclass(frozen=True) -class BlockHandle: - """See: https://github.com/google/leveldb/blob/master/doc/table_format.md - A BlockHandle contains an offset and length of a block in an ldb table file""" - - offset: int - length: int - - @classmethod - def from_stream(cls, stream: typing.BinaryIO): - return cls(read_le_varint(stream), read_le_varint(stream)) - - @classmethod - def from_bytes(cls, data: bytes): - with io.BytesIO(data) as stream: - return BlockHandle.from_stream(stream) - - -@dataclasses.dataclass(frozen=True) -class RawBlockEntry: - """Raw key, value for a record in a LDB file Block, along with the offset within the block from which it came from - See: https://github.com/google/leveldb/blob/master/doc/table_format.md""" - - key: bytes - value: bytes - block_offset: int - - -class FileType(enum.Enum): - Ldb = 1 - Log = 2 - - -class KeyState(enum.Enum): - Deleted = 0 - Live = 1 - Unknown = 2 - - -@dataclasses.dataclass(frozen=True) -class Record: - """A record from leveldb; includes details of the origin file, state, etc.""" - - key: bytes - value: bytes - seq: int - state: KeyState - file_type: FileType - origin_file: os.PathLike - offset: int - was_compressed: bool - - @property - def user_key(self): - if self.file_type == FileType.Ldb: - if len(self.key) < 8: - return self.key - else: - return self.key[0:-8] - else: - return self.key - - @classmethod - def ldb_record( - cls, - key: bytes, - value: bytes, - origin_file: os.PathLike, - offset: int, - was_compressed: bool, - ): - seq = (struct.unpack("> 8 - if len(key) > 8: - state = KeyState.Deleted if key[-8] == 0 else KeyState.Live - else: - state = KeyState.Unknown - return cls( - key, value, 
seq, state, FileType.Ldb, origin_file, offset, was_compressed - ) - - @classmethod - def log_record( - cls, - key: bytes, - value: bytes, - seq: int, - state: KeyState, - origin_file: os.PathLike, - offset: int, - ): - return cls(key, value, seq, state, FileType.Log, origin_file, offset, False) - - -class Block: - """Block from an .lldb (table) file. See: https://github.com/google/leveldb/blob/master/doc/table_format.md""" - - def __init__( - self, raw: bytes, was_compressed: bool, origin: "LdbFile", offset: int - ): - self._raw = raw - self.was_compressed = was_compressed - self.origin = origin - self.offset = offset - - (self._restart_array_count,) = struct.unpack(" int: - offset = self._restart_array_offset + (index * 4) - return struct.unpack(" int: - return self.get_restart_offset(0) - - def __iter__(self) -> typing.Iterable[RawBlockEntry]: - offset = self.get_first_entry_offset() - with io.BytesIO(self._raw) as buff: - buff.seek(offset) - - key = b"" - - while buff.tell() < self._restart_array_offset: - start_offset = buff.tell() - shared_length = read_le_varint(buff, is_google_32bit=True) - non_shared_length = read_le_varint(buff, is_google_32bit=True) - value_length = read_le_varint(buff, is_google_32bit=True) - - # sense check - if offset >= self._restart_array_offset: - raise ValueError( - "Reading start of entry past the start of restart array" - ) - if shared_length > len(key): - raise ValueError( - "Shared key length is larger than the previous key" - ) - - key = key[:shared_length] + buff.read(non_shared_length) - value = buff.read(value_length) - - yield RawBlockEntry(key, value, start_offset) - - -class LdbFile: - """A leveldb table (.ldb or .sst) file.""" - - BLOCK_TRAILER_SIZE = 5 - FOOTER_SIZE = 48 - MAGIC = 0xDB4775248B80FB57 - - def __init__(self, file: pathlib.Path): - if not file.exists(): - raise FileNotFoundError(file) - - self.path = file - self.file_no = int(file.stem, 16) - - self._f = file.open("rb") - self._f.seek(-LdbFile.FOOTER_SIZE, os.SEEK_END) - - self._meta_index_handle = BlockHandle.from_stream(self._f) - self._index_handle = BlockHandle.from_stream(self._f) - self._f.seek(-8, os.SEEK_END) - (magic,) = struct.unpack(" typing.Tuple[typing.Tuple[bytes, BlockHandle], ...]: - index_block = self._read_block(self._index_handle) - # key is earliest key, value is BlockHandle to that data block - return tuple( - (entry.key, BlockHandle.from_bytes(entry.value)) for entry in index_block - ) - - def __iter__(self) -> typing.Iterable[Record]: - """Iterate Records in this Table file""" - for block_key, handle in self._index: - block = self._read_block(handle) - for entry in block: - yield Record.ldb_record( - entry.key, - entry.value, - self.path, - block.offset - if block.was_compressed - else block.offset + entry.block_offset, - block.was_compressed, - ) - - def close(self): - self._f.close() - - -class LogEntryType(enum.IntEnum): - Zero = 0 - Full = 1 - First = 2 - Middle = 3 - Last = 4 - - -class LogFile: - """A levelDb log (.log) file""" - - LOG_ENTRY_HEADER_SIZE = 7 - LOG_BLOCK_SIZE = 32768 - - def __init__(self, file: pathlib.Path): - if not file.exists(): - raise FileNotFoundError(file) - - self.path = file - self.file_no = int(file.stem, 16) - - self._f = file.open("rb") - - def _get_raw_blocks(self) -> typing.Iterable[bytes]: - self._f.seek(0) - - while chunk := self._f.read(LogFile.LOG_BLOCK_SIZE): - yield chunk - - def _get_batches(self) -> typing.Iterable[typing.Tuple[int, bytes]]: - in_record = False - start_block_offset = 0 - block = b"" - for idx, 
chunk_ in enumerate(self._get_raw_blocks()): - with io.BytesIO(chunk_) as buff: - while buff.tell() < LogFile.LOG_BLOCK_SIZE - 6: - header = buff.read(7) - if len(header) < 7: - break - crc, length, block_type = struct.unpack(" typing.Iterable[Record]: - """Iterate Records in this Log file""" - for batch_offset, batch in self._get_batches(): - # as per write_batch and write_batch_internal - # offset length description - # 0 8 (u?)int64 Sequence number - # 8 4 (u?)int32 Count - the log batch can contain multple entries - # - # Then Count * the following: - # - # 12 1 ValueType (KeyState as far as this library is concerned) - # 13 1-4 VarInt32 length of key - # ... ... Key data - # ... 1-4 VarInt32 length of value - # ... ... Value data - - with io.BytesIO(batch) as buff: # it's just easier this way - header = buff.read(12) - seq, count = struct.unpack(" typing.Iterable[bytes]: - self._f.seek(0) - - while chunk := self._f.read(LogFile.LOG_BLOCK_SIZE): - yield chunk - - def _get_batches(self) -> typing.Iterable[typing.Tuple[int, bytes]]: - in_record = False - start_block_offset = 0 - block = b"" - for idx, chunk_ in enumerate(self._get_raw_blocks()): - with io.BytesIO(chunk_) as buff: - while buff.tell() < LogFile.LOG_BLOCK_SIZE - 6: - header = buff.read(7) - if len(header) < 7: - break - crc, length, block_type = struct.unpack(" pathlib.Path: - return self._in_dir - - def iterate_records_raw(self, *, reverse=False) -> typing.Iterable[Record]: - for file_containing_records in sorted( - self._files, reverse=reverse, key=lambda x: x.file_no - ): - yield from file_containing_records - - def close(self): - for file in self._files: - file.close() - if self.manifest: - self.manifest.close() diff --git a/utils/ccl_chrome_indexeddb/ccl_simplesnappy.py b/utils/ccl_chrome_indexeddb/ccl_simplesnappy.py deleted file mode 100644 index bf7e492..0000000 --- a/utils/ccl_chrome_indexeddb/ccl_simplesnappy.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -Copyright 2020, CCL Forensics - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-""" - -import sys -import struct -import io -import typing -import enum - -__version__ = "0.1" -__description__ = "Pure Python reimplementation of Google's Snappy decompression" -__contact__ = "Alex Caithness" - - -DEBUG = False - - -def log(msg): - if DEBUG: - print(msg) - - -class ElementType(enum.IntEnum): - """Run type in the compressed snappy data (literal data or offset to backreferenced data_""" - - Literal = 0 - CopyOneByte = 1 - CopyTwoByte = 2 - CopyFourByte = 3 - - -def _read_le_varint( - stream: typing.BinaryIO, -) -> typing.Optional[typing.Tuple[int, bytes]]: - """Read varint from a stream. - If the read is successful: returns a tuple of the (unsigned) value and the raw bytes making up that varint, - otherwise returns None""" - # this only outputs unsigned - i = 0 - result = 0 - underlying_bytes = [] - while i < 10: # 64 bit max possible? - raw = stream.read(1) - if len(raw) < 1: - return None - (tmp,) = raw - underlying_bytes.append(tmp) - result |= (tmp & 0x7F) << (i * 7) - if (tmp & 0x80) == 0: - break - i += 1 - return result, bytes(underlying_bytes) - - -def read_le_varint(stream: typing.BinaryIO) -> typing.Optional[int]: - """Convenience version of _read_le_varint that only returns the value or None""" - x = _read_le_varint(stream) - if x is None: - return None - else: - return x[0] - - -def read_uint16(stream: typing.BinaryIO) -> int: - """Reads a Uint16 from stream""" - return struct.unpack(" int: - """Reads a Uint24 from stream""" - return struct.unpack(" int: - """Reads a Uint32 from stream""" - return struct.unpack(" typing.Optional[int]: - """Reads a single byte from stream (or returns None if EOD is met)""" - x = stream.read(1) - if x: - return x[0] - - return None - - -def decompress(data: typing.BinaryIO) -> bytes: - """Decompresses the snappy compressed data stream""" - uncompressed_length = read_le_varint(data) - log(f"Uncompressed length: {uncompressed_length}") - - out = io.BytesIO() - - while True: - start_offset = data.tell() - log(f"Reading tag at offset {start_offset}") - type_byte = read_byte(data) - if type_byte is None: - break - - log(f"Type Byte is {type_byte:02x}") - - tag = type_byte & 0x03 - - log(f"Element Type is: {ElementType(tag)}") - - if tag == ElementType.Literal: - if ((type_byte & 0xFC) >> 2) < 60: # embedded in tag - length = 1 + ((type_byte & 0xFC) >> 2) - log(f"Literal length is embedded in type byte and is {length}") - elif ((type_byte & 0xFC) >> 2) == 60: # 8 bit - length = 1 + read_byte(data) - log(f"Literal length is 8bit and is {length}") - elif ((type_byte & 0xFC) >> 2) == 61: # 16 bit - length = 1 + read_uint16(data) - log(f"Literal length is 16bit and is {length}") - elif ((type_byte & 0xFC) >> 2) == 62: # 16 bit - length = 1 + read_uint24(data) - log(f"Literal length is 24bit and is {length}") - elif ((type_byte & 0xFC) >> 2) == 63: # 16 bit - length = 1 + read_uint32(data) - log(f"Literal length is 32bit and is {length}") - else: - raise ValueError() # cannot ever happen - - literal_data = data.read(length) - if len(literal_data) < length: - raise ValueError("Couldn't read enough literal data") - - out.write(literal_data) - - else: - if tag == ElementType.CopyOneByte: - length = ((type_byte & 0x1C) >> 2) + 4 - offset = ((type_byte & 0xE0) << 3) | read_byte(data) - elif tag == ElementType.CopyTwoByte: - length = 1 + ((type_byte & 0xFC) >> 2) - offset = read_uint16(data) - elif tag == ElementType.CopyFourByte: - length = 1 + ((type_byte & 0xFC) >> 2) - offset = read_uint32(data) - else: - raise ValueError() # cannot ever 
happen - - if offset == 0: - raise ValueError("Offset cannot be 0") - - actual_offset = out.tell() - offset - log(f"Current Outstream Length: {out.tell()}") - log(f"Backreference length: {length}") - log(f"Backreference relative offset: {offset}") - log(f"Backreference absolute offset: {actual_offset}") - - # have to read incrementally because you might have to read data that you've just written - # this is probably a really slow way of doing this. - for i in range(length): - out.write( - out.getbuffer()[actual_offset + i : actual_offset + i + 1].tobytes() - ) - - result = out.getvalue() - if uncompressed_length != len(result): - raise ValueError("Wrong data length in uncompressed data") - # TODO: allow a partial / potentially bad result via a flag in the function call? - - return result - - -def main(path): - import pathlib - import hashlib - - f = pathlib.Path(path).open("rb") - decompressed = decompress(f) - print(decompressed) - sha1 = hashlib.sha1() - sha1.update(decompressed) - print(sha1.hexdigest()) - - -if __name__ == "__main__": - main(sys.argv[1]) diff --git a/utils/ccl_chrome_indexeddb/ccl_v8_value_deserializer.py b/utils/ccl_chrome_indexeddb/ccl_v8_value_deserializer.py deleted file mode 100644 index 2faccd5..0000000 --- a/utils/ccl_chrome_indexeddb/ccl_v8_value_deserializer.py +++ /dev/null @@ -1,644 +0,0 @@ -""" -Copyright 2020, CCL Forensics - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" - -import sys -import struct -import datetime -import types -import typing -import re - -__version__ = "0.1" -__description__ = "Partial reimplementation of the V8 Javascript Object Serialization" -__contact__ = "Alex Caithness" - -# TODO: We need to address cyclic references, which are permissible. Probably take the same approach as in ccl_bplist -# and subclass the collection types to resolve references JIT - -# See: https://github.com/v8/v8/blob/master/src/objects/value-serializer.cc - -__DEBUG = False - - -def log(msg, debug_only=True): - if not debug_only or __DEBUG: - caller_name = sys._getframe(1).f_code.co_name - caller_line = sys._getframe(1).f_code.co_firstlineno - print(f"{caller_name} ({caller_line}):\t{msg}") - - -def read_le_varint( - stream: typing.BinaryIO, -) -> typing.Optional[typing.Tuple[int, bytes]]: - # this only outputs unsigned - i = 0 - result = 0 - underlying_bytes = [] - while i < 10: # 64 bit max possible? 
- raw = stream.read(1) - if len(raw) < 1: - return None - (tmp,) = raw - underlying_bytes.append(tmp) - result |= (tmp & 0x7F) << (i * 7) - if (tmp & 0x80) == 0: - break - i += 1 - return result, bytes(underlying_bytes) - - -class _Undefined: - def __bool__(self): - return False - - def __eq__(self, other): - if isinstance(other, _Undefined): - return True - return False - - def __repr__(self): - return "" - - def __str__(self): - return "" - - -class Constants: - # Constants - kLatestVersion = 13 - - # version:uint32_t (if at beginning of data, sets version > 0) - token_kVersion = b"\xFF" - # ignore - token_kPadding = b"\0" - # refTableSize:uint32_t (previously used for sanity checks; safe to ignore) - token_kVerifyObjectCount = b"?" - # Oddballs (no data). - token_kTheHole = b"-" - token_kUndefined = b"_" - token_kNull = b"0" - token_kTrue = b"T" - token_kFalse = b"F" - # Number represented as 32-bit integer, ZigZag-encoded - # (like sint32 in protobuf) - token_kInt32 = b"I" - # Number represented as 32-bit unsigned integer, varint-encoded - # (like uint32 in protobuf) - token_kUint32 = b"U" - # Number represented as a 64-bit double. - # Host byte order is used (N.B. this makes the format non-portable). - token_kDouble = b"N" - # BigInt. Bitfield:uint32_t, then raw digits storage. - token_kBigInt = b"Z" - # byteLength:uint32_t, then raw data - token_kUtf8String = b"S" - token_kOneByteString = b'"' - token_kTwoByteString = b"c" - # Reference to a serialized object. objectID:uint32_t - token_kObjectReference = b"^" - # Beginning of a JS object. - token_kBeginJSObject = b"o" - # End of a JS object. numProperties:uint32_t - token_kEndJSObject = b"{" - # Beginning of a sparse JS array. length:uint32_t - # Elements and properties are written as token_key/value pairs, like objects. - token_kBeginSparseJSArray = b"a" - # End of a sparse JS array. numProperties:uint32_t length:uint32_t - token_kEndSparseJSArray = b"@" - # Beginning of a dense JS array. length:uint32_t - # |length| elements, followed by properties as token_key/value pairs - token_kBeginDenseJSArray = b"A" - # End of a dense JS array. numProperties:uint32_t length:uint32_t - token_kEndDenseJSArray = b"$" - # Date. millisSinceEpoch:double - token_kDate = b"D" - # Boolean object. No data. - token_kTrueObject = b"y" - token_kFalseObject = b"x" - # Number object. value:double - token_kNumberObject = b"n" - # BigInt object. Bitfield:uint32_t, then raw digits storage. - token_kBigIntObject = b"z" - # String object, UTF-8 encoding. byteLength:uint32_t, then raw data. - token_kStringObject = b"s" - # Regular expression, UTF-8 encoding. byteLength:uint32_t, raw data - # flags:uint32_t. - token_kRegExp = b"R" - # Beginning of a JS map. - token_kBeginJSMap = b";" - # End of a JS map. length:uint32_t. - token_kEndJSMap = b":" - # Beginning of a JS set. - token_kBeginJSSet = b"'" - # End of a JS set. length:uint32_t. - token_kEndJSSet = b"," - # Array buffer. byteLength:uint32_t, then raw data. - token_kArrayBuffer = b"B" - # Array buffer (transferred). transferID:uint32_t - token_kArrayBufferTransfer = b"t" - # View into an array buffer. - # subtag:ArrayBufferViewTag, byteOffset:uint32_t, byteLength:uint32_t - # For typed arrays, byteOffset and byteLength must be divisible by the size - # of the element. - # Note: token_kArrayBufferView is special, and should have an ArrayBuffer (or an - # ObjectReference to one) serialized just before it. This is a quirk arising - # from the previous stack-based implementation. 
- token_kArrayBufferView = b"V" - # Shared array buffer. transferID:uint32_t - token_kSharedArrayBuffer = b"u" - # A wasm module object transfer. next value is its index. - token_kWasmModuleTransfer = b"w" - # The delegate is responsible for processing all following data. - # This "escapes" to whatever wire format the delegate chooses. - token_kHostObject = b"\\" - # A transferred WebAssembly.Memory object. maximumPages:int32_t, then by - # SharedArrayBuffer tag and its data. - token_kWasmMemoryTransfer = b"m" - # A list of (subtag: ErrorTag, [subtag dependent data]). See ErrorTag for - # details. - token_kError = b"r" - - # The following tags are reserved because they were in use in Chromium before - # the token_kHostObject tag was introduced in format version 13, at - # v8 refs/heads/master@{#43466} - # chromium/src refs/heads/master@{#453568} - # - # They must not be reused without a version check to prevent old values from - # starting to deserialize incorrectly. For simplicity, it's recommended to - # avoid them altogether. - # - # This is the set of tags that existed in SerializationTag.h at that time and - # still exist at the time of this writing (i.e., excluding those that were - # removed on the Chromium side because there should be no real user data - # containing them). - # - # It might be possible to also free up other tags which were never persisted - # (e.g. because they were used only for transfer) in the future. - token_kLegacyReservedMessagePort = b"M" - token_kLegacyReservedBlob = b"b" - token_kLegacyReservedBlobIndex = b"i" - token_kLegacyReservedFile = b"f" - token_kLegacyReservedFileIndex = b"e" - token_kLegacyReservedDOMFileSystem = b"d" - token_kLegacyReservedFileList = b"l" - token_kLegacyReservedFileListIndex = b"L" - token_kLegacyReservedImageData = b"#" - token_kLegacyReservedImageBitmap = b"g" - token_kLegacyReservedImageBitmapTransfer = b"G" - token_kLegacyReservedOffscreenCanvas = b"H" - token_kLegacyReservedCryptoKey = b"token_k" - token_kLegacyReservedRTCCertificate = b"token_k" - - -class ArrayBufferViewTag: - tag_kInt8Array = "b" - tag_kUint8Array = "B" - tag_kUint8ClampedArray = "C" - tag_kInt16Array = "w" - tag_kUint16Array = "W" - tag_kInt32Array = "d" - tag_kUint32Array = "D" - tag_kFloat32Array = "f" - tag_kFloat64Array = "F" - tag_kBigInt64Array = "q" - tag_kBigUint64Array = "Q" - tag_kDataView = "?" 
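STRUCT_LOOKUP below maps each ArrayBufferView tag to a struct format character, which _wrap_js_array_buffer_view later uses to unpack the raw buffer. A self-contained sketch of that unpacking step over made-up bytes (not repository data):

import struct

# Illustrative only: a Uint32Array-style view ("I" per the lookup below) over
# hypothetical raw bytes, unpacked the way _wrap_js_array_buffer_view does.
raw = struct.pack("<3I", 1, 2, 3)
element_count = len(raw) // struct.calcsize("I")
values = struct.unpack(f"<{element_count}I", raw)
assert values == (1, 2, 3)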
- - STRUCT_LOOKUP = types.MappingProxyType( - { - tag_kInt8Array: "b", - tag_kUint8Array: "B", - tag_kUint8ClampedArray: "B", - tag_kInt16Array: "h", - tag_kUint16Array: "H", - tag_kInt32Array: "i", - tag_kUint32Array: "I", - tag_kFloat32Array: "f", - tag_kFloat64Array: "d", - tag_kBigInt64Array: "q", - tag_kBigUint64Array: "Q", - tag_kDataView: "c", - } - ) - - -class Deserializer: - Undefined = _Undefined() - - __ODDBALLS = { - Constants.token_kUndefined: Undefined, - Constants.token_kTheHole: Undefined, - Constants.token_kNull: None, - Constants.token_kTrue: True, - Constants.token_kFalse: False, - } - - __WRAPPED_PRIMITIVES = { - Constants.token_kTrueObject, - Constants.token_kFalseObject, - Constants.token_kNumberObject, - Constants.token_kBigIntObject, - Constants.token_kStringObject, - } - - def __init__( - self, - stream: typing.BinaryIO, - host_object_delegate: typing.Callable, - *, - is_little_endian=True, - is_64bit=True, - ): - self._f = stream - self._host_object_delegate = host_object_delegate - self._endian = "<" if is_little_endian else ">" - self._pointer_size = 8 if is_64bit else 4 - self._next_id = 0 - self._objects = [] - self.version = self._read_header() - - def _read_raw(self, length: int) -> bytes: - start = self._f.tell() - raw = self._f.read(length) - if len(raw) != length: - raise ValueError( - f"Could not read all data at offset {start}; wanted {length}; got {len(raw)}" - ) - - return raw - - def _read_le_varint(self) -> typing.Optional[typing.Tuple[int, bytes]]: - return read_le_varint(self._f) - - def _read_zigzag(self) -> int: - unsigned = self._read_le_varint()[0] - if unsigned & 1: - return -(unsigned >> 1) - else: - return unsigned >> 1 - - def _read_double(self) -> float: - return struct.unpack(f"{self._endian}d", self._read_raw(8))[0] - - # def _read_uint32(self) -> int: - # return self._read_le_varint() - - # def _read_uint64(self) -> int: - # return self._read_le_varint() - - def _read_bigint(self) -> int: - size_flag = self._read_le_varint()[0] - is_neg = size_flag & 0x01 - size = size_flag >> 4 - raw = self._read_raw(size * self._pointer_size) - - value = int.from_bytes( - raw, "big" if self._endian == ">" else "little", signed=False - ) - if is_neg: - value = -value - - return value - - def _read_utf8_string(self) -> str: - length = self._read_le_varint()[0] - return self._read_raw(length).decode("utf8") - - def _read_one_byte_string(self) -> typing.AnyStr: - length = self._read_le_varint()[0] - # I think this can be used to store raw 8-bit data, so return ascii if we can, otherwise bytes - raw = self._read_raw(length) # .decode("ascii") - try: - result = raw.decode("ascii") - except UnicodeDecodeError: - result = raw - return result - - def _read_two_byte_string(self) -> str: - length = self._read_le_varint()[0] - return self._read_raw(length).decode("utf-16-le") # le? 
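The one- and two-byte string payloads are length-prefixed: a little-endian varint byte count followed by the raw character data, which _read_two_byte_string above decodes as UTF-16-LE. A minimal sketch with made-up bytes (lengths under 0x80 fit in a single varint byte):

import io

# Illustrative only: a hypothetical kTwoByteString payload, decoded the same
# way _read_two_byte_string does (varint length, then UTF-16-LE data).
payload = "hi".encode("utf-16-le")                  # 4 bytes of character data
stream = io.BytesIO(bytes([len(payload)]) + payload)

length = stream.read(1)[0]                          # single-byte varint length
assert stream.read(length).decode("utf-16-le") == "hi"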
- - def _read_string(self) -> str: - if self.version < 12: - return self._read_utf8_string() - - value = self._read_object() - assert isinstance(value, str) - - return value - - def _read_object_by_reference(self) -> typing.Any: - ref_id = self._read_le_varint()[0] - return self._objects[ref_id] - - def _read_tag(self) -> bytes: - while True: - t = self._f.read(1) - if t != Constants.token_kPadding: - return t - - def _peek_tag(self) -> bytes: - start = self._f.tell() - tag = self._read_tag() - self._f.seek(start, 0) - return tag - - def _read_date(self) -> datetime.datetime: - x = self._read_double() - result = datetime.datetime(1970, 1, 1) + datetime.timedelta(milliseconds=x) - self._objects.append(result) - return result - - def _read_js_regex(self) -> typing.Pattern: - log(f"Reading js regex properties at {self._f.tell()}") - pattern = self._read_string() - flags = self._read_le_varint() - - # TODO: Flags? - regex = re.compile(pattern) - self._objects.append(regex) - return regex - - def _read_js_object_properties( - self, end_tag - ) -> typing.Iterable[typing.Tuple[typing.Any, typing.Any]]: - log(f"Reading object properties at {self._f.tell()} with end tag: {end_tag}") - while True: - if self._peek_tag() == end_tag: - log(f"Object end at offset {self._f.tell()}") - break - key = self._read_object() - value = self._read_object() - - yield key, value - - assert self._read_tag() == end_tag - - def _read_js_object(self) -> dict: - log(f"Reading js object properties at {self._f.tell()}") - result = {} - self._objects.append(result) - for key, value in self._read_js_object_properties(Constants.token_kEndJSObject): - result[key] = value - # while True: - # if self._peek_tag() == end_tag: - # log(f"Object end at offset {self._f.tell()}") - # break - # key = self._read_object() - # value = self._read_object() - # result[key] = value - # - # assert self._read_tag() == end_tag - property_count = self._read_le_varint()[0] - log( - f"Actual property count: {len(result)}; stated property count: {property_count}" - ) - if len(result) != property_count: - raise ValueError("Property count mismatch") - - return result - - def _read_js_sparse_array(self) -> list: - log(f"Reading js sparse array properties at {self._f.tell()}") - # TODO: implement a sparse list so that this isn't so horribly inefficient - length = self._read_le_varint()[0] - result = [None for _ in range(length)] - self._objects.append(result) - - sparse_object = self._read_js_object_properties( - Constants.token_kEndSparseJSArray - ) - prop_count = 0 - for key, value in sparse_object: - i = int(key) - result[i] = value - prop_count += 1 - expected_num_properties = self._read_le_varint()[0] - - log( - f"Actual property count: {prop_count}; stated property count: {expected_num_properties}" - ) - if prop_count != expected_num_properties: - raise ValueError("Property count mismatch") - - expected_length = self._read_le_varint()[0] # TODO: should this be checked? - - return result - - def _read_js_dense_array(self) -> list: - log(f"Reading js dense array properties at {self._f.tell()}") - length = self._read_le_varint()[0] - result = [None for _ in range(length)] - self._objects.append(result) - - for i in range(length): - result[i] = self._read_object() - - # And then there's a sparse bit maybe? 
- sparse_object = self._read_js_object_properties( - Constants.token_kEndDenseJSArray - ) - prop_count = 0 - for key, value in sparse_object: - i = int(key) - result[i] = value - prop_count += 1 - - expected_num_properties = self._read_le_varint()[0] - - log( - f"Actual property count: {prop_count}; stated property count: {expected_num_properties}" - ) - if prop_count != expected_num_properties: - raise ValueError("Property count mismatch") - - expected_length = self._read_le_varint()[0] # TODO: should this be checked? - - return result - - def _read_js_map(self) -> dict: - log(f"Reading js map at {self._f.tell()}") - result = {} - self._objects.append(result) - while True: - if self._peek_tag() == Constants.token_kEndJSMap: - log(f"End of map at {self._f.tell()}") - break - - key = self._read_object() - value = self._read_object() - result[key] = value - - assert self._read_tag() == Constants.token_kEndJSMap - - expected_length = self._read_le_varint()[0] - log( - f"Actual map item count: {len(result) * 2}; stated map item count: {expected_length}" - ) - if expected_length != len(result) * 2: - raise ValueError("Map count mismatch") - - return result - - def _read_js_set(self) -> set: - log(f"Reading js set properties at {self._f.tell()}") - result = set() - self._objects.append(result) - - while True: - if self._peek_tag() == Constants.token_kEndJSSet: - log(f"End of set at {self._f.tell()}") - break - - result.add(self._read_object()) - - assert self._read_tag() == Constants.token_kEndJSSet - - expected_length = self._read_le_varint()[0] - log( - f"Actual set item count: {len(result)}; stated set item count: {expected_length}" - ) - if expected_length != len(result): - raise ValueError("Set count mismatch") - - return result - - def _read_js_arraybuffer(self) -> bytes: - length = self._read_le_varint()[0] - raw = self._read_raw(length) - self._objects.append(raw) - - return raw - - def _wrap_js_array_buffer_view(self, raw: bytes) -> tuple: - if not isinstance(raw, bytes): - raise TypeError( - "Only bytes should be passed to be wrapped in a buffer view" - ) - - log(f"Wrapping in ArrayBufferView at offset {self._f.tell()}") - - tag = chr(self._read_le_varint()[0]) - byte_offset = self._read_le_varint()[0] - byte_length = self._read_le_varint()[0] - - if byte_offset + byte_length > len(raw): - raise ValueError("Not enough data in the raw data to hold the defined data") - - log( - f"ArrayBufferView: tag: {tag}; byte_offset: {byte_offset}; byte_length: {byte_length}" - ) - - fmt = ArrayBufferViewTag.STRUCT_LOOKUP[tag] - element_length = struct.calcsize(fmt) - if byte_length % element_length != 0: - raise ValueError( - f"ArrayBufferView doesn't fit nicely: byte_length: {byte_length}; " - f"element_length: {element_length}" - ) - - element_count = byte_length // element_length - - return struct.unpack( - f"{self._endian}{element_count}{fmt}", - raw[byte_offset : byte_offset + byte_length], - ) - - def _read_host_object(self) -> typing.Any: - result = self._host_object_delegate(self._f) - self._objects.append(result) - return result - - def _not_implemented(self): - raise NotImplementedError("Todo") - - def _read_object_internal(self) -> typing.Tuple[bytes, typing.Any]: - tag = self._read_tag() - - log(f"Offset: {self._f.tell()}; Tag: {tag}") - - if tag in Deserializer.__ODDBALLS: - return tag, Deserializer.__ODDBALLS[tag] - - func = { - Constants.token_kTrueObject: lambda: Deserializer.__ODDBALLS[ - Constants.token_kTrue - ], - Constants.token_kFalseObject: lambda: Deserializer.__ODDBALLS[ - 
Constants.token_kFalse - ], - Constants.token_kNumberObject: self._read_double, - Constants.token_kUint32: self._read_le_varint, - Constants.token_kInt32: self._read_zigzag, - Constants.token_kDouble: self._read_double, - Constants.token_kDate: self._read_date, - Constants.token_kBigInt: self._read_bigint, - Constants.token_kBigIntObject: self._read_bigint, - Constants.token_kUtf8String: self._read_utf8_string, - Constants.token_kOneByteString: self._read_one_byte_string, - Constants.token_kTwoByteString: self._read_two_byte_string, - Constants.token_kStringObject: self._read_string, - Constants.token_kRegExp: self._read_js_regex, - Constants.token_kObjectReference: self._read_object_by_reference, - Constants.token_kBeginJSObject: self._read_js_object, - Constants.token_kBeginSparseJSArray: self._read_js_sparse_array, - Constants.token_kBeginDenseJSArray: self._read_js_dense_array, - Constants.token_kBeginJSMap: self._read_js_map, - Constants.token_kBeginJSSet: self._read_js_set, - Constants.token_kArrayBuffer: self._read_js_arraybuffer, - Constants.token_kSharedArrayBuffer: self._not_implemented, # and probably never, as it can't be pulled from the data I think? - Constants.token_kArrayBufferTransfer: self._not_implemented, - Constants.token_kError: self._not_implemented, - Constants.token_kWasmModuleTransfer: self._not_implemented, - Constants.token_kWasmMemoryTransfer: self._not_implemented, - Constants.token_kHostObject: self._read_host_object, - }.get(tag) - - if func is None: - raise ValueError(f"Unknown tag {tag}") - - value = func() - - if tag in Deserializer.__WRAPPED_PRIMITIVES: - self._objects.append(value) - - return tag, value - - def _read_object(self) -> typing.Any: - log(f"Read object at offset: {self._f.tell()}") - tag, o = self._read_object_internal() - - if self._peek_tag() == Constants.token_kArrayBufferView: - assert self._read_tag() == Constants.token_kArrayBufferView - o = self._wrap_js_array_buffer_view(o) - - return o - - def _read_header(self) -> int: - tag = self._read_tag() - if tag != Constants.token_kVersion: - raise ValueError("Didn't get version tag in the header") - version = self._read_le_varint()[0] - return version - - def read(self) -> typing.Any: - return self._read_object() diff --git a/utils/ccl_chrome_indexeddb/dump_indexeddb_details.py b/utils/ccl_chrome_indexeddb/dump_indexeddb_details.py deleted file mode 100644 index 3dd7dd0..0000000 --- a/utils/ccl_chrome_indexeddb/dump_indexeddb_details.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys -import pathlib -import ccl_chromium_indexeddb - - -def main(args): - ldb_path = pathlib.Path(args[0]) - wrapper = ccl_chromium_indexeddb.WrappedIndexDB(ldb_path) - - for db_info in wrapper.database_ids: - db = wrapper[db_info.dbid_no] - print("------Database------") - print(f"db_number={db.db_number}; name={db.name}; origin={db.origin}") - print() - print("\t---Object Stores---") - for obj_store_name in db.object_store_names: - obj_store = db[obj_store_name] - print( - f"\tobject_store_id={obj_store.object_store_id}; name={obj_store.name}" - ) - try: - one_record = next(obj_store.iterate_records()) - except StopIteration: - one_record = None - if one_record is not None: - print("\tExample record:") - print(f"\tkey: {one_record.key}") - print(f"\tvalue: {one_record.value}") - else: - print("\tNo records") - print() - print() - - -if __name__ == "__main__": - if len(sys.argv) < 2: - print(f"USAGE: {pathlib.Path(sys.argv[0]).name} ") - exit(1) - main(sys.argv[1:]) diff --git 
a/utils/ccl_chrome_indexeddb/dump_leveldb.py b/utils/ccl_chrome_indexeddb/dump_leveldb.py deleted file mode 100644 index 72e71cf..0000000 --- a/utils/ccl_chrome_indexeddb/dump_leveldb.py +++ /dev/null @@ -1,64 +0,0 @@ -import sys -import csv -import ccl_leveldb -import pathlib - -ENCODING = "iso-8859-1" - - -def main(args): - input_path = args[0] - output_path = "leveldb_dump.csv" - if len(args) > 1: - output_path = args[1] - - leveldb_records = ccl_leveldb.RawLevelDb(input_path) - - with open(output_path, "w", encoding="utf-8", newline="") as file1: - writes = csv.writer(file1, quoting=csv.QUOTE_ALL) - writes.writerow( - [ - "key-hex", - "key-text", - "value-hex", - "value-text", - "origin_file", - "file_type", - "offset", - "seq", - "state", - "was_compressed", - ] - ) - - for record in leveldb_records.iterate_records_raw(): - writes.writerow( - [ - record.user_key.hex(" ", 1), - record.user_key.decode(ENCODING, "replace"), - record.value.hex(" ", 1), - record.value.decode(ENCODING, "replace"), - str(record.origin_file), - record.file_type.name, - record.offset, - record.seq, - record.state.name, - record.was_compressed, - ] - ) - - -if __name__ == "__main__": - if len(sys.argv) < 2: - print(f"Usage: {pathlib.Path(sys.argv[0]).name} [outpath.csv]") - exit(1) - print() - print("+--------------------------------------------------------+") - print("|Please note: keys and values in leveldb are binary blobs|") - print("|so any text seen in the output of this script might not |") - print("|represent the entire meaning of the data. The output of |") - print("|this script should be considered as a preview of the |") - print("|data only. |") - print("+--------------------------------------------------------+") - print() - main(sys.argv[1:]) diff --git a/utils/shared.py b/utils/shared.py index 01ab095..1f8a082 100644 --- a/utils/shared.py +++ b/utils/shared.py @@ -26,7 +26,7 @@ import json import os -from ccl_chrome_indexeddb import ( +from chromedb import ( ccl_blink_value_deserializer, ccl_chromium_indexeddb, ccl_v8_value_deserializer, @@ -34,7 +34,7 @@ ccl_chromium_localstorage, ccl_chromium_sessionstorage, ) -from ccl_chrome_indexeddb.ccl_chromium_indexeddb import ( +from chromedb.ccl_chromium_indexeddb import ( DatabaseMetadataType, ObjectStoreMetadataType, ) @@ -122,7 +122,7 @@ def fetch_data(self): ( objstore_id, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.key[len(prefix_objectstore) :] ) except TypeError: @@ -190,7 +190,7 @@ def iterate_records(self, do_not_filter=False): ( value_version, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.value ) val_idx = len(varint_raw) @@ -203,7 +203,7 @@ def iterate_records(self, do_not_filter=False): ( blink_version, varint_raw, - ) = ccl_chromium_indexeddb.custom_le_varint_from_bytes( + ) = ccl_chromium_indexeddb.le_varint_from_bytes( record.value[val_idx:] )
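With the vendored copy removed, consumers import the same modules from the pip-installed chromedb distribution pinned in requirements.txt, as the utils/shared.py hunks above show. A minimal usage sketch, assuming the forked package keeps the LocalStoreDb API of the module deleted above; the input path is a hypothetical example:

import pathlib

from chromedb import ccl_chromium_localstorage

# Hypothetical Local Storage leveldb directory from a Chromium-based profile.
leveldb_dir = pathlib.Path("Local Storage/leveldb")

if leveldb_dir.is_dir():
    store = ccl_chromium_localstorage.LocalStoreDb(leveldb_dir)
    for record in store.iter_all_records():
        print(record.storage_key, record.script_key, record.value)
    store.close()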