Skip to content

Commit

Permalink
Merge pull request #144 from SupercedeTech/master
Browse files Browse the repository at this point in the history
Add streaming of xlsx file support
  • Loading branch information
qrilka authored Dec 19, 2021
2 parents bb3d80d + e549d1f commit 5a7dc61
Show file tree
Hide file tree
Showing 21 changed files with 2,317 additions and 548 deletions.
16 changes: 15 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,24 @@
TAGS
cabal-dev
dist
dist-newstyle
*sandbox*
#*#
*.*~
specs
samples
.stack-work
*.lock

# nix
result
result-doc
*.lock
*.o
*.hi
*.prof
*.aux
*.hp
*.ps
.envrc
.direnv
cabal.project.local
27 changes: 27 additions & 0 deletions benchmarks/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,47 @@
module Main (main) where

import Codec.Xlsx
import Codec.Xlsx.Parser.Stream
import Codec.Xlsx.Writer.Stream
import Control.DeepSeq
import Control.Lens
import Control.Monad (void)
import Criterion.Main
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LB
import qualified Data.Conduit as C
import qualified Data.Conduit.Combinators as C
import Data.Maybe

main :: IO ()
main = do
let filename = "data/testInput.xlsx"
-- "data/6000.rows.x.26.cols.xlsx"
bs <- BS.readFile filename
let bs' = LB.fromStrict bs
parsed :: Xlsx
parsed = toXlsxFast bs'
idx <- fmap (fromMaybe (error "ix not found")) $ runXlsxM filename $ makeIndexFromName "Sample list"
items <- runXlsxM filename $ collectItems idx
deepseq (parsed, bs', idx, items) (pure ())
defaultMain
[ bgroup
"readFile"
[ bench "with xlsx" $ nf toXlsx bs'
, bench "with xlsx fast" $ nf toXlsxFast bs'
, bench "with stream (counting)" $ nfIO $ runXlsxM filename $ countRowsInSheet idx
, bench "with stream (reading)" $ nfIO $ runXlsxM filename $ readSheet idx (pure . rwhnf)
]
, bgroup
"writeFile"
[ bench "with xlsx" $ nf (fromXlsx 0) parsed
, bench "with stream (no sst)" $
nfIO $ C.runConduit $
void (writeXlsxWithSharedStrings defaultSettings mempty $ C.yieldMany $ view si_row <$> items)
C..| C.fold
, bench "with stream (sst)" $
nfIO $ C.runConduit $
void (writeXlsx defaultSettings $ C.yieldMany $ view si_row <$> items)
C..| C.fold
]
]
Binary file added data/inline-strings.xlsx
Binary file not shown.
27 changes: 27 additions & 0 deletions default.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
let
rev = "07ca3a021f05d6ff46bbd03c418b418abb781279"; # first 21.05 release
url = "https://github.com/NixOS/nixpkgs/archive/${rev}.tar.gz";
compiler = "ghc884";
isLibraryProfiling = false;
pkgs = import (builtins.fetchTarball url) {
config = if isLibraryProfiling then ({
packageOverrides = pkgs_super: {
haskell = pkgs_super.haskell // {
packages = pkgs_super.haskell.packages // {
"${compiler}" = pkgs_super.haskell.packages."${compiler}".override {
overrides = self: super: {
mkDerivation = args: super.mkDerivation (args // {
enableLibraryProfiling = true;
});
};
};
};
};
};
}) else {};
};

hpkgs = pkgs.haskell.packages."${compiler}";
in pkgs.haskell.lib.overrideCabal (hpkgs.callCabal2nix "xlsx" ./. {}) {
libraryToolDepends = [pkgs.cabal-install];
}
2 changes: 2 additions & 0 deletions shell.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
(import ./default.nix).env # not flake-based
# (import ./.).devShell."${builtins.currentSystem}" # flake-based
10 changes: 5 additions & 5 deletions src/Codec/Xlsx/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NoMonomorphismRestriction #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
Expand All @@ -16,7 +17,7 @@ module Codec.Xlsx.Parser
, Parser
) where

import qualified Codec.Archive.Zip as Zip
import qualified "zip-archive" Codec.Archive.Zip as Zip
import Control.Applicative
import Control.Arrow (left)
import Control.Error.Safe (headErr)
Expand All @@ -27,7 +28,7 @@ import Lens.Micro
#else
import Control.Lens hiding ((<.>), element, views)
#endif
import Control.Monad (forM, join, void)
import Control.Monad (join, void)
import Control.Monad.Except (catchError, throwError)
import Data.Bool (bool)
import Data.ByteString (ByteString)
Expand All @@ -54,7 +55,6 @@ import Codec.Xlsx.Parser.Internal
import Codec.Xlsx.Parser.Internal.PivotTable
import Codec.Xlsx.Types
import Codec.Xlsx.Types.Cell (formulaDataFromCursor)
import Codec.Xlsx.Types.Common (xlsxTextToCellValue)
import Codec.Xlsx.Types.Internal
import Codec.Xlsx.Types.Internal.CfPair
import Codec.Xlsx.Types.Internal.CommentTable as CommentTable
Expand All @@ -71,7 +71,7 @@ import Codec.Xlsx.Types.PivotTable.Internal
toXlsx :: L.ByteString -> Xlsx
toXlsx = either (error . show) id . toXlsxEither

data ParseError = InvalidZipArchive
data ParseError = InvalidZipArchive String
| MissingFile FilePath
| InvalidFile FilePath Text
| InvalidRef FilePath RefId
Expand Down Expand Up @@ -106,7 +106,7 @@ toXlsxEitherBase ::
-> L.ByteString
-> Parser Xlsx
toXlsxEitherBase parseSheet bs = do
ar <- left (const InvalidZipArchive) $ Zip.toArchiveOrFail bs
ar <- left InvalidZipArchive $ Zip.toArchiveOrFail bs
sst <- getSharedStrings ar
contentTypes <- getContentTypes ar
(wfs, names, cacheSources, dateBase) <- readWorkbook ar
Expand Down
53 changes: 53 additions & 0 deletions src/Codec/Xlsx/Parser/Internal/Memoize.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE TypeApplications #-}

-- | I rewrote: https://hackage.haskell.org/package/unliftio-0.2.20/docs/src/UnliftIO.Memoize.html#Memoized
-- for monad trans basecontrol
-- we don't need a generic `m` anyway. it's good enough in base IO.
module Codec.Xlsx.Parser.Internal.Memoize
( Memoized
, runMemoized
, memoizeRef
) where

import Control.Applicative as A
import Control.Monad (join)
import Control.Monad.IO.Class
import Data.IORef
import Control.Exception

-- | A \"run once\" value, with results saved. Extract the value with
-- 'runMemoized'. For single-threaded usage, you can use 'memoizeRef' to
-- create a value. If you need guarantees that only one thread will run the
-- action at a time, use 'memoizeMVar'.
--
-- Note that this type provides a 'Show' instance for convenience, but not
-- useful information can be provided.
newtype Memoized a = Memoized (IO a)
deriving (Functor, A.Applicative, Monad)
instance Show (Memoized a) where
show _ = "<<Memoized>>"

-- | Extract a value from a 'Memoized', running an action if no cached value is
-- available.
runMemoized :: MonadIO m => Memoized a -> m a
runMemoized (Memoized m) = liftIO m
{-# INLINE runMemoized #-}

-- | Create a new 'Memoized' value using an 'IORef' under the surface. Note that
-- the action may be run in multiple threads simultaneously, so this may not be
-- thread safe (depending on the underlying action).
memoizeRef :: IO a -> IO (Memoized a)
memoizeRef action = do
ref <- newIORef Nothing
pure $ Memoized $ do
mres <- readIORef ref
res <-
case mres of
Just res -> pure res
Nothing -> do
res <- try @SomeException action
writeIORef ref $ Just res
pure res
either throwIO pure res
Loading

0 comments on commit 5a7dc61

Please sign in to comment.