From 516082c561e438fa67c133538c1be4cb5580bc5b Mon Sep 17 00:00:00 2001 From: MatrixEditor <58256046+MatrixEditor@users.noreply.github.com> Date: Tue, 26 Dec 2023 10:46:42 +0100 Subject: [PATCH] New type: prefixed arrays and strings + updated documentation + some examples were updated + corrected the name of BItField --- caterpillar/_common.py | 55 +++++++++++++++- caterpillar/abc.py | 4 ++ caterpillar/fields/__init__.py | 1 + caterpillar/fields/_base.py | 20 ++++-- caterpillar/fields/common.py | 62 +++++++++++++++++- caterpillar/model/__init__.py | 5 +- caterpillar/model/_base.py | 2 + caterpillar/model/_bitfield.py | 6 +- docs/source/development/changelog.rst | 5 ++ docs/source/development/contribution.rst | 24 +++++++ docs/source/development/index.rst | 5 ++ docs/source/index.rst | 14 ++-- docs/source/installing/index.rst | 15 ++++- docs/source/library/index.rst | 1 + docs/source/library/model.rst | 34 ++++++++++ docs/source/reference/baseclasses.rst | 7 -- docs/source/reference/datamodel.rst | 9 +++ docs/source/reference/index.rst | 10 ++- docs/source/reference/introduction.rst | 83 +++++++++++++++++++++++- examples/formats/caf.py | 4 +- examples/formats/nibarchive.py | 19 ++---- 21 files changed, 340 insertions(+), 45 deletions(-) create mode 100644 docs/source/development/changelog.rst create mode 100644 docs/source/development/contribution.rst create mode 100644 docs/source/development/index.rst delete mode 100644 docs/source/reference/baseclasses.rst diff --git a/caterpillar/_common.py b/caterpillar/_common.py index a8ccfeb..5556fa5 100644 --- a/caterpillar/_common.py +++ b/caterpillar/_common.py @@ -16,7 +16,7 @@ from typing import List, Any, Union, Iterable -from caterpillar.abc import _GreedyType, _ContextLike, isgreedy, _StreamType +from caterpillar.abc import _GreedyType, _ContextLike, isgreedy, _StreamType, isprefixed from caterpillar.context import ( Context, CTX_PATH, @@ -26,9 +26,26 @@ CTX_OBJECT, CTX_STREAM, ) +from caterpillar.options import 
F_SEQUENTIAL from caterpillar.exception import Stop, StructException, InvalidValueError +class WithoutFlag: + def __init__(self, context: _ContextLike, flag) -> None: + self.context = context + self.field = context[CTX_FIELD] + self.flag = flag + + def __enter__(self) -> None: + self.field ^= self.flag + + def __exit__(self, exc_type, exc_value, traceback) -> None: + self.field |= self.flag + # We have to apply the right field as instances of the Field class + # might set their own value into the context. + self.context[CTX_FIELD] = self.field + + def unpack_seq(context: _ContextLike, unpack_one) -> List[Any]: """Generic function to unpack sequenced elements. @@ -49,10 +66,28 @@ def unpack_seq(context: _ContextLike, unpack_one) -> List[Any]: # the new context. The '_pos' attribute will be adjusted automatically. values = [] # always list (maybe add factory) seq_context = Context( - _parent=context, _io=stream, _length=length, _lst=values, _field=field + _parent=context, + _io=stream, + _length=length, + _lst=values, + _field=field, + _obj=context.get(CTX_OBJECT), ) greedy = isgreedy(length) + prefixed = isprefixed(length) seq_context[CTX_POS] = stream.tell() + if prefixed: + # We have to temporarily remove the array status from the parsing field + with WithoutFlag(context, F_SEQUENTIAL): + field.amount = 1 + new_length = length.start.__unpack__(context) + field.amount, length = length, new_length + + if not isinstance(length, int): + raise InvalidValueError( + f"Prefix struct returned non-integer: {length!r}", context + ) + for i in range(length) if not greedy else itertools.count(): try: seq_context[CTX_PATH] = ".".join([base_path, str(i)]) @@ -93,10 +128,24 @@ def pack_seq(seq: List[Any], context: _ContextLike, pack_one) -> None: # REVISIT: when to use field.length(context) count = len(seq) + length = field.amount + if isprefixed(length): + struct = length.start + # We have to temporarily alter the field's values. + with WithoutFlag(context, F_SEQUENTIAL): +
field.amount = 1 + struct.__pack__(count, context) + field.amount = length # Special elements '_index' and '_length' can be referenced within # the new context. The '_pos' attribute will be adjusted automatically. - seq_context = Context(_parent=context, _io=stream, _length=count, _field=field) + seq_context = Context( + _parent=context, + _io=stream, + _length=count, + _field=field, + _obj=context.get(CTX_OBJECT), + ) seq_context[CTX_POS] = stream.tell() for i, elem in enumerate(seq): # The path will contain an additional hint on what element is processed diff --git a/caterpillar/abc.py b/caterpillar/abc.py index f6341bb..a076f74 100644 --- a/caterpillar/abc.py +++ b/caterpillar/abc.py @@ -24,6 +24,7 @@ _StreamFactory = Callable[[], _StreamType] _GreedyType = type(...) +_PrefixedType = slice class _ContextLike(dict): @@ -177,3 +178,6 @@ def typeof(struct: Union[_StructLike, _ContainsStruct]) -> type: def isgreedy(obj) -> bool: return isinstance(obj, _GreedyType) + +def isprefixed(obj) -> bool: + return isinstance(obj, _PrefixedType) \ No newline at end of file diff --git a/caterpillar/fields/__init__.py b/caterpillar/fields/__init__.py index ada053f..0502bfd 100644 --- a/caterpillar/fields/__init__.py +++ b/caterpillar/fields/__init__.py @@ -32,6 +32,7 @@ Computed, Pass, CString, + Prefixed, padding, char, boolean, diff --git a/caterpillar/fields/_base.py b/caterpillar/fields/_base.py index 7e62715..a0721a7 100644 --- a/caterpillar/fields/_base.py +++ b/caterpillar/fields/_base.py @@ -24,6 +24,7 @@ _StreamType, _ContextLike, _GreedyType, + _PrefixedType, hasstruct, getstruct, typeof, @@ -100,7 +101,7 @@ class Field(_StructLike): An automatic flag that indicates this field stores a sequential struct. """ - amount: Union[_ContextLambda, int, _GreedyType] + amount: Union[_ContextLambda, int, _GreedyType, _PrefixedType] """ A constant or dynamic value to represent the amount of structs. Zero indicates there are no sequence types associated with this field. 
@@ -140,7 +141,7 @@ def __init__( order: ByteOrder = SysNative, offset: Union[_ContextLambda, int] = -1, flags: Set[Flag] = None, - amount: Union[_ContextLambda, int] = 0, + amount: Union[_ContextLambda, int, _PrefixedType] = 0, options: Union[_Switch, Dict[Any, _StructLike], None] = None, condition: Union[_ContextLambda, bool] = True, arch: Arch = None, @@ -195,7 +196,7 @@ def __matmul__(self, offset: Union[_ContextLambda, int]) -> Self: return self def __getitem__(self, dim: Union[_ContextLambda, int, _GreedyType]) -> Self: - self._verify_context_value(dim, (_GreedyType, int)) + self._verify_context_value(dim, (_GreedyType, int, _PrefixedType)) self.amount = dim if self.amount != 0: self.flags.add(F_SEQUENTIAL) @@ -218,6 +219,17 @@ def __rsub__(self, bits: Union[_ContextLambda, int]) -> Self: self.bits = bits return self + def __set_byteorder__(self, order: ByteOrder) -> Self: + self.order = order + return self + + __ixor__ = __xor__ + __ior__ = __or__ + __ifloordiv__ = __floordiv__ + __irshift__ = __rshift__ + __imatmul__ = __matmul__ + __isub__ = __rsub__ + def is_seq(self) -> bool: """Returns whether this field is sequential. 
@@ -256,7 +268,7 @@ def length(self, context: _ContextLike) -> Union[int, _GreedyType]: :rtype: Union[int, _GreedyType] """ try: - if isinstance(self.amount, (int, _GreedyType)): + if isinstance(self.amount, (int, _GreedyType, _PrefixedType)): return self.amount return self.amount(context) diff --git a/caterpillar/fields/common.py b/caterpillar/fields/common.py index 2b475a0..e24d52a 100644 --- a/caterpillar/fields/common.py +++ b/caterpillar/fields/common.py @@ -26,8 +26,14 @@ _EnumLike, isgreedy, ) -from caterpillar.exception import ValidationError, StructException, InvalidValueError +from caterpillar.exception import ( + ValidationError, + StructException, + InvalidValueError, + DynamicSizeError, +) from caterpillar.context import CTX_FIELD, CTX_STREAM +from caterpillar.options import F_SEQUENTIAL from ._base import Field, FieldStruct @@ -531,6 +537,7 @@ def unpack_single(self, context: _ContextLike) -> Any: def __class_getitem__(cls, dim) -> Field: return CString(...)[dim] + class ConstString(Const): """ A specialized constant field for handling string values. @@ -612,3 +619,56 @@ def pack_single(self, obj: Any, context: _ContextLike) -> None: def unpack_single(self, context: _ContextLike) -> None: # No need for an implementation pass + + +class Prefixed(FieldStruct): + def __init__(self, prefix: Optional[_StructLike] = None, encoding: Optional[str] = None): + self.encoding = encoding + self.prefix = prefix or uint32 + + def __type__(self) -> type: + return bytes if not self.encoding else str + + def __size__(self, context: _ContextLike) -> int: + """ + Calculate the size of the Prefixed field. + + :param context: The current context. + :return: The size of the Bytes field. + """ + raise DynamicSizeError("Prefixed does not store a size", context) + + def pack_single(self, obj: bytes, context: _ContextLike) -> None: + """ + Pack a single bytes object into the stream. + + :param obj: The bytes object to pack. + :param context: The current context. 
+ """ + self.prefix.__pack__(len(obj), context) + if self.encoding: + obj = obj.encode(self.encoding) + context[CTX_STREAM].write(obj) + + def unpack_single(self, context: _ContextLike) -> Any: + """ + Unpack a single bytes object from the stream. + + :param context: The current context. + :return: The unpacked bytes object. + """ + field: Field = context[CTX_FIELD] + is_seq = field.is_seq() + if is_seq: + # We have to remove the sequence status temporarily + field ^= F_SEQUENTIAL + + size = self.prefix.unpack_single(context) + data = context[CTX_STREAM].read(size) + if self.encoding: + data = data.decode(self.encoding) + + # The status has to be added again + if is_seq: + field |= F_SEQUENTIAL + return data diff --git a/caterpillar/model/__init__.py b/caterpillar/model/__init__.py index a294188..abf6f6a 100644 --- a/caterpillar/model/__init__.py +++ b/caterpillar/model/__init__.py @@ -24,6 +24,7 @@ pack_file, ) from ._bitfield import ( - Bitfield, - bitfield + BitField, + bitfield, + BitFieldGroup ) \ No newline at end of file diff --git a/caterpillar/model/_base.py b/caterpillar/model/_base.py index a0ff5c7..230786a 100644 --- a/caterpillar/model/_base.py +++ b/caterpillar/model/_base.py @@ -45,6 +45,8 @@ @dataclass(init=False) class Sequence(_StructLike, FieldMixin): + """Default implementation for a sequence of fields.""" + model: Any """ Specifies the target class/dictionary used as the base model. 
diff --git a/caterpillar/model/_bitfield.py b/caterpillar/model/_bitfield.py index 9eae569..df437fb 100644 --- a/caterpillar/model/_bitfield.py +++ b/caterpillar/model/_bitfield.py @@ -67,7 +67,7 @@ class BitFieldGroup: fields: Dict[BitTuple, Field] = dcfield(default_factory=dict) -class Bitfield(Struct): +class BitField(Struct): groups: List[BitFieldGroup] def __init__( @@ -101,7 +101,7 @@ def __init__( del self._current_group def __add__(self, other: "BitField") -> Self: - if not isinstance(other, Bitfield): + if not isinstance(other, BitField): raise ValidationError( f"Attempted to add a non-bitfield struct to a bitfield! (type={type(other)})" ) @@ -319,7 +319,7 @@ def _make_bitfield( arch: Optional[Arch] = None, field_options: Iterable[Flag] = None, ) -> type: - _ = Bitfield( + _ = BitField( cls, order=order, arch=arch, options=options, field_options=field_options ) return cls diff --git a/docs/source/development/changelog.rst b/docs/source/development/changelog.rst new file mode 100644 index 0000000..eebf90c --- /dev/null +++ b/docs/source/development/changelog.rst @@ -0,0 +1,5 @@ +.. _changelog: + +********* +Changelog +********* \ No newline at end of file diff --git a/docs/source/development/contribution.rst b/docs/source/development/contribution.rst new file mode 100644 index 0000000..377c857 --- /dev/null +++ b/docs/source/development/contribution.rst @@ -0,0 +1,24 @@ +.. _contribution: + +*********************** +Contribution Guidelines +*********************** + + +Submit a new Feature +-------------------- + +*TODO* + +Submit an Issue +--------------- + +*TODO* + +Running Tests +------------- + +*TODO* + +Having a general question? +-------------------------- \ No newline at end of file diff --git a/docs/source/development/index.rst b/docs/source/development/index.rst new file mode 100644 index 0000000..a628e42 --- /dev/null +++ b/docs/source/development/index.rst @@ -0,0 +1,5 @@ +.. 
_development-index: + +*********** +Development +*********** \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 761182a..5acb41a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -29,8 +29,7 @@ to write complex structures in a compact and readable manner. magic: b"Foo" # constant values name: CString(...) # C-String without a fixed length value: le + uint16 # little endian encoding - num_entries: be + uint32 # simple field definition + big endian encoding - entries: CString[this.num_entries] # arrays just like that + entries: be + CString[uint32::] # arrays with big-endian prefixed length .. admonition:: Hold up, wait a minute! @@ -40,16 +39,16 @@ to write complex structures in a compact and readable manner. Working with defined classes is as straightforward as working with normal classes. *All constant values are created automatically!* ->>> obj = Format(name="Hello, World!", value=1, num_entries=1, entries=["Bar"]) +>>> obj = Format(name="Hello, World!", value=10, entries=["Bar", "Baz"]) >>> print(obj) -Format(magic=b'Foo', name='Hello, World!', value=1, num_entries=1, entries=['Bar']) +Format(magic=b'Foo', name='Hello, World!', value=10, entries=['Bar', 'Baz']) Packing and unpacking have never been easier: >>> pack(obj) -b'FooHello, World!\x00\x01\x00\x00\x00\x00\x00\x00\x01Bar\x00' +b'FooHello, World!\x00\n\x00\x00\x00\x00\x02Bar\x00Baz\x00' >>> unpack(Format, _) -Format(magic=b'Foo', name='Hello, World!', value=1, num_entries=1, entries=['Bar']) +Format(magic=b'Foo', name='Hello, World!', value=10, entries=['Bar', 'Baz']) .. admonition:: What about documentation? @@ -70,9 +69,10 @@ what configuration options can be used. 
Alternatively you can follow the :ref:`t :caption: Contents: installing/index.rst - reference/index.rst tutorial/index.rst + reference/index.rst library/index.rst + development/index.rst diff --git a/docs/source/installing/index.rst b/docs/source/installing/index.rst index 202fa6d..ccb9503 100644 --- a/docs/source/installing/index.rst +++ b/docs/source/installing/index.rst @@ -4,5 +4,18 @@ Installation ************ -*TODO* +*caterpillar* does not have a direct Python installation candidate. Therefore, you will need +to install it by providing the Git link. This library has no fixed dependencies, so it can run +out of the box. +.. code-block:: bash + + pip install git+https://github.com/MatrixEditor/caterpillar.git + + +.. note:: + If you clone the repository, don't forget to add `-e` to the installation via pip as + it enables developer mode. + + +If you wish to contribute to this project, make sure you follow the :ref:`contribution`. \ No newline at end of file diff --git a/docs/source/library/index.rst b/docs/source/library/index.rst index b9a71fa..006ac91 100644 --- a/docs/source/library/index.rst +++ b/docs/source/library/index.rst @@ -8,4 +8,5 @@ Library :maxdepth: 2 :numbered: + model.rst options.rst diff --git a/docs/source/library/model.rst b/docs/source/library/model.rst index e69de29..f96a91e 100644 --- a/docs/source/library/model.rst +++ b/docs/source/library/model.rst @@ -0,0 +1,34 @@ +.. _model: + +************ +Struct Model +************ + +.. autoclass:: caterpillar.model.Sequence + :members: + +.. autoclass:: caterpillar.model.Struct + :members: + +.. autofunction:: caterpillar.model.struct + +.. autofunction:: caterpillar.model.union + +.. autofunction:: caterpillar.model.pack + +.. autofunction:: caterpillar.model.pack_into + +.. autofunction:: caterpillar.model.pack_file + +.. autofunction:: caterpillar.model.unpack + +.. autofunction:: caterpillar.model.unpack_file + +.. autoclass:: caterpillar.model.BitFieldGroup + :members: + +.. 
autoclass:: caterpillar.model.BitField + :members: + +.. autofunction:: caterpillar.model.bitfield + diff --git a/docs/source/reference/baseclasses.rst b/docs/source/reference/baseclasses.rst deleted file mode 100644 index 057d916..0000000 --- a/docs/source/reference/baseclasses.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _baseclasses: - -********************* -Abstract Base Classes -********************* - -*TODO: describe classes and protocols* \ No newline at end of file diff --git a/docs/source/reference/datamodel.rst b/docs/source/reference/datamodel.rst index b7843fc..5f7d2b4 100644 --- a/docs/source/reference/datamodel.rst +++ b/docs/source/reference/datamodel.rst @@ -277,6 +277,15 @@ example. >>> unpack(field, b"abcd\x00") 'abcd' +Prefixed +-------- + +In addition to greedy parsing, this library supports prefixed packing and unpacking as well. With *prefixed*, we refer +to an array whose number of elements is stored in front of it and parsed first. In this library, the :code:`slice` class is used to achieve a +prefix option. + +>>> field = CString[uint32::] + Context ------- diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 04a5c38..d6fbac2 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -4,10 +4,18 @@ Caterpillar's Reference ################################ +If you've completed the tutorial, you are now ready to delve into the internal logic and models +used to provide the user-friendly interface. Please note that some sections are still in +development and may be subject to changes or removal. This framework is evolving over time, and +nothing in the current documentation is considered to be final. + +To get started with the internal API, it is recommended to visit the :ref:`library-index` API +documentation. + ..
toctree:: :maxdepth: 2 :numbered: - baseclasses.rst + introduction.rst datamodel.rst operators.rst \ No newline at end of file diff --git a/docs/source/reference/introduction.rst b/docs/source/reference/introduction.rst index eedb868..d554c1d 100644 --- a/docs/source/reference/introduction.rst +++ b/docs/source/reference/introduction.rst @@ -1,5 +1,84 @@ -.. _introduction +.. _introduction: ************ Introduction -************ \ No newline at end of file +************ + +The "library reference" contains several different documents describing the core model of this framework. After +the tutorial, you are now able to dive deeper into the functionalities of this library. + +.. admonition:: What exactly is a *caterpillar*? + + Caterpillars (|c0|/k |c1| t |c2| rp |c3| l |c4| r - 🐛) are the wormlike larva of a butterfly or moth. [1]_ + Just as caterpillars undergo a metamorphosis, *caterpillar* facilitates the metamorphosis of data structures + into runtime objects. + + +This document aims to address burning questions regarding design and availability. It provides an overview of the +aspects covered by this framework and those that it doesn't. In general, this library was designed to enhance the +process of reverse engineering binary structures using readable and shareable code. The use of *"static"* [2]_ +class definitions delivers advantages but also brings up some problems that we need to discuss. + + +Why use Caterpillar? +-------------------- + +There are several reasons to incorporate this library into your code. Some of the scenarios where Caterpillar can +be beneficial include: + +- **Quick Reverse Engineering**: When you need to rapidly reverse engineer a binary structure. +- **Creating Presentable Binary Structures**: When there's a task to create a binary structure, and the result should be presentable. +- **Exploration in Python**: When you want to experiment and play around in Python. 
+ + +The biggest advantage of *Caterpillar* is the lack of external dependencies (though extensions can be integrated using +dependencies). Additionally, the minimal lines of code required to define structures speak for themselves, as +demonstrated in the following example from `examples/formats/caf`: + +.. code-block:: python + :linenos: + + @struct(order=BigEndian) + class CAFChunk: + chunk_header: CAFChunkHeader + data: Field(this.chunk_header.chunk_type) >> { + b"desc": CAFAudioFormat, + b"info": CAFStringsChunk, + b"pakt": CAFPacketTable, + b"data": CAFData, + b"free": padding[this.chunk_header.chunk_size], + DEFAULT_OPTION: Bytes(this.chunk_header.chunk_size), + } + + +How does this even work? +^^^^^^^^^^^^^^^^^^^^^^^^ + +*Caterpillar* utilizes Python's annotations to build its model by processing class definitions. With the use +of Python 3.12, there are no conflicts in using annotations for defining fields. + +.. code-block:: python + + @struct + class Format: + # <name>: <field> [ = <default> ] + +By using annotations, we can simply define a default value if desired, eliminating the need to make the code +more complex by using assignments. + +Pros & Cons +----------- + +*TODO* + + + + +.. |c0| unicode:: U+02C8 +.. |c1| unicode:: U+00E6 +.. |c2| unicode:: U+0259 +.. |c3| unicode:: U+026A +.. |c4| unicode:: U+0259 + +.. [1] https://en.wikipedia.org/wiki/Caterpillar +.. [2] Even structs generated from class models are extensible to some degree. \ No newline at end of file diff --git a/examples/formats/caf.py b/examples/formats/caf.py index ff9caef..bacab7f 100644 --- a/examples/formats/caf.py +++ b/examples/formats/caf.py @@ -97,8 +97,8 @@ class CAFInformation: class CAFStringsChunk: # A simple prefixed field where we use a reference to an already parsed value # as the length.
- num_entries: uint32 - strings: CAFInformation[this.num_entries] + # num_entries: uint32 (now read implicitly as the array's length prefix) + strings: CAFInformation[uint32::] @struct(order=BigEndian) diff --git a/examples/formats/nibarchive.py b/examples/formats/nibarchive.py index 824026e..3e52614 100644 --- a/examples/formats/nibarchive.py +++ b/examples/formats/nibarchive.py @@ -76,13 +76,9 @@ class NIBClassName: extras: int32[this.extras_count] -@struct(order=LittleEndian) -class NIBKey: - length: VarInt - # Note that the returned string instance here may contain extra null-bytes - # at the end. - name: String(this.length) - +# Note that the returned string instance here may contain extra null-bytes +# at the end. +NIBKey = Prefixed(VarInt, "utf-8") class ValueType(enum.Enum): UNKNOWN = -1 @@ -99,11 +95,9 @@ class ValueType(enum.Enum): OBJECT_REF = 10 -@struct(order=LittleEndian) -class NIBData: - length: VarInt - # The raw data is just copied from the stream. - data: Bytes(this.length) +# The raw data is just copied from the stream. If we don't specify an +# encoding, the raw bytes are copied. +NIBData = Prefixed(VarInt) @struct(order=LittleEndian) @@ -164,4 +158,5 @@ class NIBArchive: # print(NIBArchive.__struct__.fields) if __name__ == '__main__': obj = unpack_file(NIBArchive, sys.argv[1]) + print(obj) pack_file(obj, sys.argv[2], use_tempfile=True)