From b95645b0a0c26afb22e51d1e220e79a0c47a927c Mon Sep 17 00:00:00 2001 From: Max Heller Date: Tue, 21 Jan 2025 20:54:47 -0500 Subject: [PATCH] Prefer MIME type when determining extensions for MediaBag items (#10557) Currently, remote images added to the MediaBag are stored at paths with extensions determined based on the external URI. For instance, an image from https://example.com/image.png is stored with a .png extension. If the URI does not contain an extension (e.g., https://example.com/image), then the content-type of the downloaded image is used to determine the extension. This change switches the precedence such that content-type is preferred over extensions contained in the URI. This is necessary because some images are located at URIs with misleading extensions -- shields.io, for instance, serves SVGs from URIs with .yml extensions. With this change, the image/svg+xml content-type is now preferred over the .yml URI extension. This fixes a bug in the PDF writer in which such an image would be mishandled due to not being identified as an SVG. --- src/Text/Pandoc/MediaBag.hs | 8 +++++--- test/Tests/MediaBag.hs | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs index 67839970359d..1a4ea2ba1da8 100644 --- a/src/Text/Pandoc/MediaBag.hs +++ b/src/Text/Pandoc/MediaBag.hs @@ -107,9 +107,11 @@ insertMedia fp mbMime contents (MediaBag mediamap) _ -> getMimeTypeDef fp'' mt = fromMaybe fallback mbMime path = maybe fp'' (unEscapeString . uriPath) uri - ext = case takeExtension path of - '.':e | '%' `notElem` e -> '.':e - _ -> maybe "" (\x -> '.':T.unpack x) $ extensionFromMimeType mt + ext = case extensionFromMimeType mt of + Just e -> '.':T.unpack e + Nothing -> case takeExtension path of + '.':e | '%' `notElem` e -> '.':e + _ -> "" -- | Lookup a media item in a 'MediaBag', returning mime type and contents. 
lookupMedia :: FilePath diff --git a/test/Tests/MediaBag.hs b/test/Tests/MediaBag.hs index 4cb4ab807dc6..fbc9eb153194 100644 --- a/test/Tests/MediaBag.hs +++ b/test/Tests/MediaBag.hs @@ -29,7 +29,7 @@ tests = [ assertBool "file in directory is not extracted with original name" exists1 exists2 <- doesFileExist ("foo" </> "f9d88c3dbe18f6a7f5670e994a947d51216cdf0e.jpg") assertBool "file above directory is not extracted with hashed name" exists2 - exists3 <- doesFileExist ("foo" </> "2a0eaa89f43fada3e6c577beea4f2f8f53ab6a1d.lua") + exists3 <- doesFileExist ("foo" </> "2a0eaa89f43fada3e6c577beea4f2f8f53ab6a1d.png") exists4 <- doesFileExist "a.lua" assertBool "data uri with malicious payload gets written outside of destination dir" (exists3 && not exists4)