Skip to content

Commit

Permalink
More
Browse files Browse the repository at this point in the history
  • Loading branch information
squeaky-pl committed Sep 11, 2024
1 parent cf737e1 commit 3f37a9b
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 17 deletions.
9 changes: 2 additions & 7 deletions bin/recompress-raw-mime.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from concurrent.futures import ThreadPoolExecutor

import click
import zstandard
from sqlalchemy.orm import Query
from sqlalchemy.sql import func

Expand Down Expand Up @@ -100,12 +99,8 @@ def recompress_batch(recompress_sha256s: "set[str]", dry_run: bool) -> None:
if data is None:
continue

if data.startswith(blockstore.ZSTD_MAGIC_NUMBER_PREFIX):
decompressed_raw_mime = zstandard.decompress(data)
else:
decompressed_raw_mime = data

compressed_raw_mime = blockstore.get_maybe_compressed_mime(
decompressed_raw_mime = blockstore.maybe_decompress_raw_mime(data)
compressed_raw_mime = blockstore.maybe_compress_raw_mime(
decompressed_raw_mime, compress=True
)
compressed_raw_mimes_by_sha256[data_sha256] = (
Expand Down
46 changes: 36 additions & 10 deletions inbox/util/blockstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,22 @@ def _data_file_path(h):
return os.path.join(_data_file_directory(h), h)


def get_maybe_compressed_mime(
def maybe_compress_raw_mime(
decompressed_raw_mime: bytes, *, compress: "bool | None" = None
) -> bytes:
"""
Optionally compress the raw MIME data.
Args:
decompressed_raw_mime: The raw MIME data, always *decompressed*.
compress:
Whether to compress the data.
If None, the value of `config["COMPRESS_RAW_MIME"]` is used
which defaults to False.
Returns:
The optionally compressed raw MIME data.
"""
if compress is None:
compress = config.get("COMPRESS_RAW_MIME", False)

Expand Down Expand Up @@ -81,7 +94,7 @@ def save_raw_mime(
Returns:
The length of the data in the datastore.
"""
compressed_raw_mime = get_maybe_compressed_mime(
compressed_raw_mime = maybe_compress_raw_mime(
decompressed_raw_mime, compress=compress
)

Expand Down Expand Up @@ -174,6 +187,26 @@ def get_from_blockstore(data_sha256, *, check_sha=True) -> Optional[bytes]:
return value


def maybe_decompress_raw_mime(compressed_raw_mime: bytes) -> bytes:
    """
    Return the raw MIME data in decompressed form.

    Args:
        compressed_raw_mime: The raw MIME data, which may or may not be
            compressed.

    Returns:
        The decompressed data when the input starts with
        `ZSTD_MAGIC_NUMBER_PREFIX`, otherwise the input unchanged.
    """
    # Uncompressed email messages always begin with headers in 7-bit ASCII,
    # while the ZSTD magic number contains bytes with the highest bit set.
    # Raw MIME therefore never starts with the magic number, which makes
    # the prefix a reliable marker for compressed data.
    is_compressed = compressed_raw_mime.startswith(ZSTD_MAGIC_NUMBER_PREFIX)
    if not is_compressed:
        return compressed_raw_mime

    return zstandard.decompress(compressed_raw_mime)


def get_raw_mime(data_sha256: str) -> "bytes | None":
"""
Get the raw MIME data from the blockstore.
Expand All @@ -190,14 +223,7 @@ def get_raw_mime(data_sha256: str) -> "bytes | None":
if compressed_raw_mime is None:
return None

# Raw MIME data will never start with the ZSTD magic number,
# because email messages always start with headers in 7-bit ASCII.
# ZSTD magic number contains bytes with the highest bit set to 1,
# so we can use it as a marker to check if the data is compressed.
if compressed_raw_mime.startswith(ZSTD_MAGIC_NUMBER_PREFIX):
decompressed_raw_mime = zstandard.decompress(compressed_raw_mime)
else:
decompressed_raw_mime = compressed_raw_mime
decompressed_raw_mime = maybe_decompress_raw_mime(compressed_raw_mime)

assert (
sha256(decompressed_raw_mime).hexdigest() == data_sha256
Expand Down

0 comments on commit 3f37a9b

Please sign in to comment.