# "typst_" is the prefix for Typst's own envvars, so "typstext_"
class _Typst(EnvConfig, env_prefix="typstext_"):
    """Settings for locating and, when it is missing, downloading the Typst executable."""

    # the path to the typst binary that will be used.
    # the Typst cog can download it to this path automatically
    typst_path: str = "bot/exts/fun/_typst_cache/typst"

    # fetching configuration. note that the defaults assume Linux on x86_64

    # the direct url to fetch a typst release archive from. It will be unpacked and the executable from it used.
    typst_archive_url: str = "https://github.com/typst/typst/releases/download/v0.12.0/typst-x86_64-unknown-linux-musl.tar.xz"
    # SHA256 hex digest the archive at typst_archive_url will be checked against.
    # can be obtained by running sha256sum on the downloaded archive
    typst_archive_sha256: str = "605130a770ebd59a4a579673079cb913a13e75985231657a71d6239a57539ec3"


Typst = _Typst()
also matches newlines, case insensitive -) + LATEX_API_URL = os.getenv("LATEX_API_URL", "https://rtex.probablyaweb.site/api/v2") PASTEBIN_URL = "https://paste.pythondiscord.com" @@ -45,26 +36,6 @@ ) -def _prepare_input(text: str) -> str: - """Extract latex from a codeblock, if it is in one.""" - if match := FORMATTED_CODE_REGEX.match(text): - return match.group("code") - return text - - -def _process_image(data: bytes, out_file: BinaryIO) -> None: - """Read `data` as an image file, and paste it on a white background.""" - image = Image.open(BytesIO(data)).convert("RGBA") - width, height = image.size - background = Image.new("RGBA", (width + 2 * PAD, height + 2 * PAD), "WHITE") - - # paste the image on the background, using the same image as the mask - # when an RGBA image is passed as the mask, its alpha band is used. - # this has the effect of skipping pasting the pixels where the image is transparent. - background.paste(image, (PAD, PAD), image) - background.save(out_file) - - class InvalidLatexError(Exception): """Represents an error caused by invalid latex.""" @@ -97,7 +68,7 @@ async def _generate_image(self, query: str, out_file: BinaryIO) -> None: f"{LATEX_API_URL}/{response_json['filename']}", raise_for_status=True ) as response: - _process_image(await response.read(), out_file) + process_image(await response.read(), out_file, PAD) async def _upload_to_pastebin(self, text: str) -> str | None: """Uploads `text` to the paste service, returning the url if successful.""" @@ -132,7 +103,7 @@ async def _prepare_error_embed(self, err: InvalidLatexError | LatexServerError | @whitelist_override(channels=LATEX_ALLOWED_CHANNNELS) async def latex(self, ctx: commands.Context, *, query: str) -> None: """Renders the text in latex and sends the image.""" - query = _prepare_input(query) + query = prepare_input(query) # the hash of the query is used as the filename in the cache. 
query_hash = hashlib.md5(query.encode()).hexdigest() # noqa: S324 diff --git a/bot/exts/fun/typst.py b/bot/exts/fun/typst.py new file mode 100644 index 0000000000..d1e9793918 --- /dev/null +++ b/bot/exts/fun/typst.py @@ -0,0 +1,309 @@ +import asyncio +import hashlib +import string +import sys +from io import BytesIO +from pathlib import Path +from subprocess import CalledProcessError +from tempfile import TemporaryDirectory + +import discord +import platformdirs +from PIL import Image +from discord.ext import commands +from pydis_core.utils.logging import get_logger +from pydis_core.utils.paste_service import ( + PasteFile, + PasteTooLongError, + PasteUploadError, + send_to_paste_service, +) + +from bot.bot import Bot +from bot.constants import Channels, Typst as Config, WHITELISTED_CHANNELS +from bot.utils.archives import archive_retrieve_file +from bot.utils.codeblocks import prepare_input +from bot.utils.decorators import whitelist_override +from bot.utils.images import crop_background +from bot.utils.typst import compile_typst + +log = get_logger(__name__) + +PASTEBIN_URL = "https://paste.pythondiscord.com" + +THIS_DIR = Path(__file__).parent +# The cache directory used for typst. A temporary subdirectory is made for each invocation, +# which should be cleaned up automatically on success. +CACHE_DIRECTORY = THIS_DIR / "_typst_cache" +CACHE_DIRECTORY.mkdir(exist_ok=True) +TEMPLATE = string.Template(Path("bot/resources/fun/typst_template.typ").read_text()) +PACKAGES_INSTALL_STRING = Path("bot/resources/fun/typst_packages.typ").read_text() + +# the default typst packages directory. +TYPST_PACKAGES_DIR = platformdirs.user_cache_path("typst") / "packages" + +# how many pixels to leave on each side when cropping the image to only the contents. Set to None to disable cropping. 
CROP_PADDING: int | None = 10
# commands.max_concurrency limit for .typst
MAX_CONCURRENCY: int = 2
# max time in seconds to allow the typst process to run
TYPST_TIMEOUT: float = 1.0
# memory limit (in bytes) to set via RLIMIT_AS for the child process.
TYPST_MEMORY_LIMIT: int = 200 * 1024**2  # 200MB, which is pretty generous
# max size of the typst output image (before cropping) to allow rather than emitting an error
MAX_RAW_SIZE = 2 * 1024**2  # 2MB
# if set, limits the internal parallelism of the typst subprocess (--jobs argument).
WORKER_JOBS: int | None = None


TYPST_ALLOWED_CHANNNELS = WHITELISTED_CHANNELS + (
    Channels.data_science_and_ai,
    Channels.algos_and_data_structs,
    Channels.python_help,
)


class InvalidTypstError(Exception):
    """Represents an error caused by invalid typst source code."""

    def __init__(self, logs: str | None):
        super().__init__(logs)
        self.logs = logs


class TypstTimeoutError(Exception):
    """Represents an error caused by the Typst rendering taking too long."""


class OutputTooBigError(Exception):
    """Represents an error caused by the Typst output image being too big."""


class EmptyImageError(Exception):
    """Represents an error caused by the output image being empty."""


class TypstWorkerCrashedError(Exception):
    """
    Represents an error caused by Typst rendering process crashing.

    This can mean the memory limit was exceeded.
    """


class Typst(commands.Cog):
    """Renders typst."""

    def __init__(self, bot: Bot):
        self.bot = bot

    async def _setup_typst(self) -> None:
        """Make sure the Typst executable exists, then populate the package directory."""
        await self._ensure_typst_executable()
        await self._setup_packages()

    async def _ensure_typst_executable(self) -> None:
        """
        Download the Typst executable if it is missing and log the version it reports.

        Raises ValueError if the configured path exists but is not a regular file.
        """
        path = Path(Config.typst_path).resolve()
        if path.exists():
            if not path.is_file():
                raise ValueError("Typst path exists but doesn't point to a file:", path)
        else:
            log.info("Typst executable not found at '%s', downloading", path)
            await self._download_typst_executable()

        proc = await asyncio.subprocess.create_subprocess_exec(
            path,
            "--version",
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()
        if proc.returncode != 0:
            # the probe is informational only, so report the failure instead of aborting cog setup
            log.warning(
                "'%s --version' exited with code %s: %s",
                path,
                proc.returncode,
                stderr.decode("utf-8", errors="replace").strip(),
            )
        log.info(
            "Typst executable reports itself as %r.",
            stdout.decode("utf-8", errors="replace").strip(),
        )

    async def _download_typst_executable(self) -> None:
        """
        Fetch the configured release archive, verify its SHA256 and store the executable on disk.

        Raises ValueError if the archive URL or hash is not configured, or if the downloaded
        archive does not match the configured hash.
        """
        if not Config.typst_archive_url:
            raise ValueError("Trying to download Typst but the archive URL isn't set")
        if not Config.typst_archive_sha256:
            raise ValueError("Trying to download Typst but the archive hash isn't set")

        async with self.bot.http_session.get(
            Config.typst_archive_url, raise_for_status=True
        ) as response:
            arc_data = await response.read()

        digest = hashlib.sha256(arc_data).hexdigest()
        # hexdigest() is lowercase; normalise the configured value so that an uppercase
        # or whitespace-padded digest in the environment doesn't cause a false mismatch
        expected_digest = Config.typst_archive_sha256.strip().lower()
        if digest != expected_digest:
            raise ValueError(
                f"Retrieved archive doesn't match hash {Config.typst_archive_sha256}; "
                f"instead got file with size {len(arc_data)} and hash {digest}"
            )

        log.info("Retrieved Typst archive, unpacking")
        typst_executable = archive_retrieve_file(
            arc_data,
            filename="typst.exe" if sys.platform == "win32" else "typst",
        )
        log.info("Storing the typst executable on disk")
        exe_path = Path(Config.typst_path)
        # the configured path may be nested in directories that don't exist yet
        exe_path.parent.mkdir(parents=True, exist_ok=True)
        exe_path.write_bytes(typst_executable)
        exe_path.chmod(0o555)  # read, execute, not write

    async def _setup_packages(self) -> None:
        """
        Install the allowed Typst packages once, then remove write permissions from them.

        Does nothing if the package directory already exists. Raises ValueError if the
        directory still doesn't exist after compiling the package-install document.
        """
        if TYPST_PACKAGES_DIR.exists():
            return
        log.info(
            "The Typst package directory '%s' doesn't currently exist; populating allowed packages.",
            TYPST_PACKAGES_DIR,
        )
        with TemporaryDirectory(
            prefix="packageinstall", dir=CACHE_DIRECTORY
        ) as tempdir:
            await compile_typst(PACKAGES_INSTALL_STRING, root_path=Path(tempdir))

        if not TYPST_PACKAGES_DIR.exists():
            raise ValueError(
                f"'{TYPST_PACKAGES_DIR}' still doesn't exist after installing packages - "
                "this suggests the packages path is incorrect or no packages were installed."
            )

        num_packages = 0
        for universe in TYPST_PACKAGES_DIR.iterdir():
            # ignore stray files next to the namespace directories
            if universe.is_dir():
                num_packages += sum(1 for _ in universe.iterdir())
        log.info(
            "Installed %s packages. Locking the packages directory against writes.",
            num_packages,
        )

        # for security, remove the write permissions from typst packages
        mode = 0o555  # read, execute, not write
        for dirpath, dirnames, filenames in TYPST_PACKAGES_DIR.walk():
            for entry in dirnames + filenames:
                (dirpath / entry).chmod(mode)
        TYPST_PACKAGES_DIR.chmod(mode)

    @commands.command()
    @commands.max_concurrency(MAX_CONCURRENCY, commands.BucketType.guild, wait=True)
    @whitelist_override(channels=TYPST_ALLOWED_CHANNNELS)
    async def typst(self, ctx: commands.Context, *, query: str) -> None:
        """Renders the text in typst and sends the image."""
        query = prepare_input(query)

        # the hash of the query is used as the tempdir name in the cache,
        # as well as the name for the rendered file.
        query_hash = hashlib.md5(query.encode()).hexdigest()  # noqa: S324
        image_path = CACHE_DIRECTORY / f"{query_hash}.png"
        async with ctx.typing():
            # a previously rendered image for the same query is reused as-is
            if not image_path.exists():
                try:
                    await self.render_typst(query, query_hash, image_path)
                except InvalidTypstError as err:
                    embed = await self._prepare_error_embed(err)
                    await ctx.send(embed=embed)
                    image_path.unlink(missing_ok=True)
                    return
                except EmptyImageError:
                    await ctx.send("The output image was empty.")
                    return
                except TypstTimeoutError:
                    await ctx.send(
                        f"Typst rendering took too long (current timeout is {TYPST_TIMEOUT}s)."
                    )
                    image_path.unlink(missing_ok=True)
                    return
                except OutputTooBigError:
                    await ctx.send(
                        f"Typst output was too big (current limit is {MAX_RAW_SIZE/1024**2:.1f}MB.)"
                    )
                    return
                except TypstWorkerCrashedError:
                    await ctx.send(
                        "Worker process crashed. "
                        f"Perhaps the memory limit of {TYPST_MEMORY_LIMIT/1024**2:.1f}MB was exceeded?"
                    )
                    return
            await ctx.send(file=discord.File(image_path, "typst.png"))

    async def render_typst(
        self, query: str, tempdir_name: str, image_path: Path
    ) -> None:
        """
        Renders the query as Typst to PNG.

        If successful, the processed output is stored in `image_path`.
        `tempdir_name` under cache will be the temporary root directory to be used.

        Raises TypstTimeoutError if compilation exceeds TYPST_TIMEOUT, TypstWorkerCrashedError
        if the subprocess was killed by a signal, InvalidTypstError (carrying the subprocess
        stderr) on any other non-zero exit, OutputTooBigError if the raw output exceeds
        MAX_RAW_SIZE, and EmptyImageError if cropping reports an empty image.
        """
        source = TEMPLATE.substitute(text=query)
        with TemporaryDirectory(prefix=tempdir_name, dir=CACHE_DIRECTORY) as tempdir:
            try:
                async with asyncio.timeout(TYPST_TIMEOUT):
                    result = await compile_typst(
                        source,
                        root_path=Path(tempdir),
                        format="png",
                        mem_rlimit=TYPST_MEMORY_LIMIT,
                        jobs=WORKER_JOBS,
                    )
            except TimeoutError:
                raise TypstTimeoutError from None
            except CalledProcessError as e:
                # when the memory limit is reached this usually shows up as signal 6 (SIGABRT), but it can vary
                if e.returncode < 0:
                    log.debug(
                        "Typst subprocess died due to a signal %s",
                        str(e).split("died with")[-1].strip(),
                    )
                    raise TypstWorkerCrashedError from e
                # if in doubt we assume it's a normal error and return the logs.
                # stderr comes from an external process, so never let a stray byte
                # turn a user-facing compile error into an unhandled decode error.
                stderr = e.stderr or b""
                raise InvalidTypstError(stderr.decode("utf-8", errors="replace")) from e

        raw_img = result.output
        if len(raw_img) > MAX_RAW_SIZE:
            log.debug(
                "Raw image rejected for having size %.1f MB", len(raw_img) / 1024**2
            )
            raise OutputTooBigError

        if CROP_PADDING is None:
            image_path.write_bytes(raw_img)
            return

        cropped = crop_background(
            Image.open(BytesIO(raw_img)).convert("RGB"),
            (255, 255, 255),
            pad=CROP_PADDING,
        )
        if cropped is None:
            raise EmptyImageError
        cropped.save(image_path)

    async def _prepare_error_embed(
        self, err: InvalidTypstError | None
    ) -> discord.Embed:
        """Build the error embed for a failed render, linking to the uploaded logs when there are any."""
        title = "There was some issue rendering your Typst, please retry later."
        if isinstance(err, InvalidTypstError):
            title = "Failed to render input as Typst."

        embed = discord.Embed(title=title)
        embed.description = "No logs available."
        logs = getattr(err, "logs", None)
        if logs:
            logs_paste_url = await self._upload_to_pastebin(logs)
            embed.description = "Couldn't upload logs."
            if logs_paste_url:
                embed.description = f"[View Logs]({logs_paste_url})"
        return embed

    async def _upload_to_pastebin(self, text: str) -> str | None:
        """Uploads `text` to the paste service, returning the url if successful."""
        file = PasteFile(content=text, lexer="text")
        try:
            resp = await send_to_paste_service(
                files=[file],
                http_session=self.bot.http_session,
            )
            return resp.link
        except (PasteTooLongError, PasteUploadError) as e:
            log.info("Error when uploading typst output to pastebin. %s", e)
            return None


async def setup(bot: Bot) -> None:
    """Load the Typst Cog."""
    cog = Typst(bot)
    # the executable and packages must be in place before the command becomes available
    await cog._setup_typst()
    await bot.add_cog(cog)
+#set page("a4", height: auto, margin: 0.5cm) + +$text diff --git a/bot/utils/archives.py b/bot/utils/archives.py new file mode 100644 index 0000000000..61ad2d1a51 --- /dev/null +++ b/bot/utils/archives.py @@ -0,0 +1,44 @@ +import tarfile +import typing +import zipfile +from io import BytesIO +from pathlib import PurePath + + +def _tar_retrieve_file(archive_data: typing.BinaryIO, filename: str) -> bytes: + with tarfile.open(fileobj=archive_data) as arc: + for el in arc.getmembers(): + if PurePath(el.name).name == filename: + fo = arc.extractfile(el) + if fo is None: + raise ValueError( + "Member has the right name but couldn't extract:", el + ) + return fo.read() + raise ValueError("No member with this name was found in archive:", filename) + + +def _zip_retrieve_file(archive_data: typing.BinaryIO, filename: str) -> bytes: + with zipfile.ZipFile(file=archive_data) as arc: + for el in arc.filelist: + if PurePath(el.filename).name == filename: + return arc.read(el) + raise ValueError("No member with this name was found in archive:", filename) + + +def archive_retrieve_file( + archive_data: bytes | typing.BinaryIO, filename: str +) -> bytes: + """Retrieves a single file by filename (not by full path) from a tar or zip archive in memory.""" + if isinstance(archive_data, bytes | bytearray | memoryview): + archive_data = BytesIO(archive_data) + if tarfile.is_tarfile(archive_data): + return _tar_retrieve_file(archive_data, filename) + try: + return _zip_retrieve_file(archive_data, filename) + except zipfile.BadZipFile as e: + if "File is not a zip file" in str(e): + raise ValueError( + "Archive unsupported: was neither a valid tarfile nor a valid zipfile" + ) + raise diff --git a/bot/utils/codeblocks.py b/bot/utils/codeblocks.py new file mode 100644 index 0000000000..55a3c9f991 --- /dev/null +++ b/bot/utils/codeblocks.py @@ -0,0 +1,18 @@ +import re + +FORMATTED_CODE_REGEX = re.compile( + r"(?P(?P```)|``?)" # code delimiter: 1-3 backticks; (?P=block) only matches if it's a 
import re

# NOTE(review): the group names had been stripped from this pattern, leaving "(?P(" which
# re.compile rejects. "delim", "block" and "code" are recovered from the references to them
# ((?P=delim), (?(block)...), match.group("code")); "lang" is not referenced anywhere here,
# so that name is an assumption - confirm against the original latex cog.
FORMATTED_CODE_REGEX = re.compile(
    r"(?P<delim>(?P<block>```)|``?)"  # code delimiter: 1-3 backticks; (?P=block) only matches if it's a block
    r"(?(block)(?:(?P<lang>[a-z]+)\n)?)"  # if we're in a block, match optional language (only letters plus newline)
    r"(?:[ \t]*\n)*"  # any blank (empty or tabs/spaces only) lines before the code
    r"(?P<code>.*?)"  # extract all code inside the markup
    r"\s*"  # any more whitespace before the end of the code markup
    r"(?P=delim)",  # match the exact same delimiter from the start again
    re.DOTALL | re.IGNORECASE,  # "." also matches newlines, case insensitive
)


def prepare_input(text: str) -> str:
    """
    Extract input from a codeblock, if it is in one. Otherwise returns the entire text.

    Only a codeblock at the very start of `text` is recognised, since the pattern is
    applied with `match`; the returned code excludes the delimiters, the optional
    language tag, leading blank lines and trailing whitespace.
    """
    if match := FORMATTED_CODE_REGEX.match(text):
        return match.group("code")
    return text
import asyncio.subprocess
import contextlib
import resource
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from subprocess import CalledProcessError
from typing import Literal

from bot.constants import Typst as Config


def _set_limits(mem_rlimit: int | None = None) -> None:
    """
    Apply resource limits to the current process; meant to run in the child before exec.

    Only the soft RLIMIT_AS limit is lowered. The inherited hard limit is kept, because
    asking for an unlimited hard limit raises ValueError when the inherited one is finite,
    and the soft limit is clamped so it never exceeds the hard limit.
    """
    if mem_rlimit is None:
        return
    _, hard = resource.getrlimit(resource.RLIMIT_AS)
    soft = mem_rlimit if hard == resource.RLIM_INFINITY else min(mem_rlimit, hard)
    resource.setrlimit(resource.RLIMIT_AS, (soft, hard))


@dataclass
class TypstCompileResult:
    """Result of Typst compilation."""

    # raw bytes the typst process wrote to stdout (the rendered document)
    output: bytes
    # raw bytes the typst process wrote to stderr
    stderr: bytes


async def compile_typst(
    source: str,
    root_path: Path,
    format: Literal["pdf", "svg", "png"] = "png",
    ppi: float | None = None,
    mem_rlimit: int | None = None,
    jobs: int | None = None,
) -> TypstCompileResult:
    """
    Renders Typst in a subprocess.

    Since malicious Typst source can take arbitrary resources to compile,
    this should be ran with a timeout, and ideally a `mem_rlimit`.
    `root_path` should be a path to a directory where all the files (if any)
    that should be accessible are placed.

    `source` is piped to the process on stdin and the rendered output is read from stdout.
    `ppi` and `jobs` are forwarded as `--ppi` and `--jobs` when given; `mem_rlimit` is the
    RLIMIT_AS value (in bytes) applied to the child process.

    Raises ValueError if the Typst executable or `root_path` is missing, and
    subprocess.CalledProcessError (carrying stdout and stderr) if the process exits non-zero.
    If this coroutine is cancelled or fails while the process is still running, the
    process is killed before the exception propagates.
    """
    typst_path = Path(Config.typst_path).resolve()
    if not typst_path.exists():
        raise ValueError("Typst executable was not found at path", typst_path)
    if not root_path.is_dir():
        raise ValueError("Root directory was not a directory", root_path)

    args = ["compile", "--root", str(root_path), "--format", format]
    if ppi is not None:
        args += ["--ppi", str(ppi)]
    if jobs is not None:
        args += ["--jobs", str(jobs)]
    # input and output from CLI
    args += ["-", "-"]

    # preexec_fn is not safe in the presence of threads, so only install it when there
    # is actually a limit to apply
    spawn_kwargs = {}
    if mem_rlimit is not None:
        spawn_kwargs["preexec_fn"] = partial(_set_limits, mem_rlimit=mem_rlimit)

    proc = None
    try:
        proc = await asyncio.subprocess.create_subprocess_exec(
            typst_path,
            *args,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            **spawn_kwargs,
        )

        stdout, stderr = await proc.communicate(input=source.encode("utf-8"))
        if proc.returncode is None:
            # shouldn't be possible
            raise RuntimeError("Process didn't terminate after communicate")
        if proc.returncode != 0:
            raise CalledProcessError(
                proc.returncode, [typst_path, *args], stdout, stderr
            )
    # if the task is cancelled or any other problem happens, make sure to kill the worker if it still exists
    except BaseException:
        if proc is not None and proc.returncode is None:
            # the process may exit between the check and the kill
            with contextlib.suppress(ProcessLookupError):
                proc.kill()
        raise
    return TypstCompileResult(output=stdout, stderr=stderr)