From 6a7859ea53c07d0e8eacf7e6424c20f9336279d5 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Wed, 8 Nov 2023 18:17:13 +0100 Subject: [PATCH 01/34] started jannoCoalesce --- src/Poseidon/CLI/Jannocoalesce.hs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/Poseidon/CLI/Jannocoalesce.hs diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs new file mode 100644 index 00000000..5e60f5ef --- /dev/null +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -0,0 +1,22 @@ +module Poseidon.CLI.Jannocoalesce where + +data JannoCoalesceOptions = JannoCoalesceOptions + { _jannocoalesceSource :: FilePath + , _jannocoalesceTarget :: FilePath + , _jannocoalesceOutSpec :: Maybe FilePath -- Nothing means "in place" + , _jannocoalesceFillColumns :: [String] -- empty list means All + , _jannocoalesceOverwriteColumns :: Bool + } + +runJannocoalesce :: JannoCoalesceOptions -> PoseidonIO () +runJannocoalesce (JannoCoalesceOptions source target outSpec fillColumns overwrite) = do + sourceJanno <- readJannoFile source + targetJanno <- readJannoFile target + let newJanno = do -- list monad + sourceRow <- sourceJanno -- loop over source rows + targetRow <- lookupRowById targetJanno sourceRow + + +lookupRowById :: (MonadThrow m) => JannoRows -> JannoRow -> m JannoRow +lookupRowById = do + undefined From 887721198b0017b58a724c325de23c96c57c3de5 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Wed, 8 Nov 2023 18:20:39 +0100 Subject: [PATCH 02/34] minor --- src/Poseidon/CLI/Jannocoalesce.hs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 5e60f5ef..a0c69e33 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -12,10 +12,9 @@ runJannocoalesce :: JannoCoalesceOptions -> PoseidonIO () runJannocoalesce (JannoCoalesceOptions source target outSpec fillColumns overwrite) = do sourceJanno <- readJannoFile source targetJanno <- readJannoFile target - let newJanno = do -- list monad - sourceRow <- sourceJanno -- loop over source rows - targetRow <- lookupRowById targetJanno sourceRow - + newJanno = forM sourceJanno $ \sourceRow -> do + targetRow <- lookupRowById targetJanno sourceRow + lookupRowById :: (MonadThrow m) => JannoRows -> JannoRow -> m JannoRow lookupRowById = do From 6db282c4c8650c91f1f7d1b05906facc3c5fefa5 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Wed, 15 Nov 2023 18:30:47 +0100 Subject: [PATCH 03/34] first simple implementation, no compile, no test yet --- src/Poseidon/CLI/Jannocoalesce.hs | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index a0c69e33..42a69ed6 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -9,13 +9,24 @@ data JannoCoalesceOptions = JannoCoalesceOptions } runJannocoalesce :: JannoCoalesceOptions -> PoseidonIO () -runJannocoalesce (JannoCoalesceOptions source target outSpec fillColumns overwrite) = do - sourceJanno <- readJannoFile source - targetJanno <- readJannoFile target - newJanno = forM sourceJanno $ \sourceRow -> do - targetRow <- lookupRowById targetJanno sourceRow +runJannocoalesce (JannoCoalesceOptions source target outSpec fields overwrite) = do + JannoRows sourceRows <- readJannoFile source + JannoRows targetRows <- readJannoFile target + newJanno <- forM targetRows $ \targetRow -> do + let posId = jPoseidonID targetRow + sourceRowCandidates = filter (\r -> jPoseidonId r == posId) sourceRows + case sourceRowCandidates of + [] -> return targetRow + [keyRow] -> return $ mergeRows targetRow keyRow fields overwrite + _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId + let outPath = maybe target id outSpec + liftIO $ writeJannoFile outPath newJanno -lookupRowById :: (MonadThrow m) => JannoRows -> JannoRow -> m JannoRow -lookupRowById = do - undefined +mergeRow :: JannoRow -> JannoRow -> [String] -> Bool -> JannoRow +mergeRow targetRow sourceRow fields overwrite = do + let targetRowRecord = toNamedRecord targetRow + sourceRowRecord = toNamedRecord sourceRow + fromNamedRecord $ HM.unionWith mergeIfMissing targetRowRecord sourceRowRecord + where + mergeIfMissing targetVal sourceVal = if targetVal `elem` ["n/a", ""] || overwrite then sourceVal else targetVal From cdaa4e5059a185f869dd0c33219e55a28cab2757 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Tue, 21 Nov 2023 20:56:07 +0100 Subject: [PATCH 04/34] continued --- src/Poseidon/CLI/Jannocoalesce.hs | 34 +++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 42a69ed6..16a36bbb 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -1,5 +1,16 @@ +{-# LANGUAGE OverloadedStrings #-} module Poseidon.CLI.Jannocoalesce where +import Poseidon.Janno (JannoRow(..), JannoRows(..), writeJannoFile, readJannoFile) +import Poseidon.Utils (PoseidonIO, PoseidonException(..)) + +import Control.Monad (forM) +import Control.Monad.Catch (throwM, MonadThrow) +import Control.Monad.IO.Class (liftIO) +import qualified Data.ByteString.Char8 as BSC +import qualified Data.Csv as Csv +import qualified Data.HashMap.Strict as HM + data JannoCoalesceOptions = JannoCoalesceOptions { _jannocoalesceSource :: FilePath , _jannocoalesceTarget :: FilePath @@ -14,19 +25,26 @@ runJannocoalesce (JannoCoalesceOptions source target outSpec fields overwrite) = JannoRows targetRows <- readJannoFile target newJanno <- forM targetRows $ \targetRow -> do let posId = jPoseidonID targetRow - sourceRowCandidates = filter (\r -> jPoseidonId r == posId) sourceRows + sourceRowCandidates = filter (\r -> jPoseidonID r == posId) sourceRows case sourceRowCandidates of [] -> return targetRow - [keyRow] -> return $ mergeRows targetRow keyRow fields overwrite + [keyRow] -> mergeRow targetRow keyRow fields overwrite _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId let outPath = maybe target id outSpec - liftIO $ writeJannoFile outPath newJanno + liftIO $ writeJannoFile outPath (JannoRows newJanno) -mergeRow :: JannoRow -> JannoRow -> [String] -> Bool -> JannoRow +mergeRow :: (MonadThrow m) => JannoRow -> JannoRow -> [String] -> Bool -> m JannoRow mergeRow targetRow sourceRow fields overwrite = do - let targetRowRecord = toNamedRecord targetRow - sourceRowRecord = toNamedRecord sourceRow - fromNamedRecord $ HM.unionWith mergeIfMissing targetRowRecord sourceRowRecord + let targetRowRecord = Csv.toNamedRecord targetRow + sourceRowRecord = Csv.toNamedRecord sourceRow + parseResult = Csv.runParser . Csv.parseNamedRecord $ HM.unionWithKey mergeIfMissing targetRowRecord sourceRowRecord + case parseResult of + Left err -> throwM $ PoseidonGenericException $ "Janno row-merge error: " ++ err + Right r -> return r where - mergeIfMissing targetVal sourceVal = if targetVal `elem` ["n/a", ""] || overwrite then sourceVal else targetVal + mergeIfMissing key targetVal sourceVal = + if (null key || (BSC.unpack key `elem` fields)) && (targetVal `elem` ["n/a", ""] || overwrite) then + sourceVal + else + targetVal From 0e57116dfeb062e3e4c8f28b1f43d0a6f4f1da8c Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Thu, 30 Nov 2023 14:08:05 +0100 Subject: [PATCH 05/34] finished API and compiles. Not tested yet --- poseidon-hs.cabal | 3 +- src-executables/Main-trident.hs | 39 +++++--- src/Poseidon/CLI/Jannocoalesce.hs | 44 ++++++--- .../CLI/OptparseApplicativeParsers.hs | 97 ++++++++++++++----- 4 files changed, 131 insertions(+), 52 deletions(-) diff --git a/poseidon-hs.cabal b/poseidon-hs.cabal index 9f96073b..64561a57 100644 --- a/poseidon-hs.cabal +++ b/poseidon-hs.cabal @@ -21,7 +21,8 @@ library Poseidon.CLI.Summarise, Poseidon.CLI.Validate, Poseidon.Utils, Poseidon.CLI.Survey, Poseidon.CLI.Forge, Poseidon.CLI.Init, Poseidon.CLI.Rectify, Poseidon.CLI.Fetch, Poseidon.CLI.Genoconvert, - Poseidon.CLI.OptparseApplicativeParsers, Poseidon.CLI.Timetravel + Poseidon.CLI.OptparseApplicativeParsers, Poseidon.CLI.Timetravel, + Poseidon.CLI.Jannocoalesce other-modules: Paths_poseidon_hs hs-source-dirs: src build-depends: base >= 4.7 && < 5, sequence-formats>=1.6.1, text, time, pipes-safe, diff --git a/src-executables/Main-trident.hs b/src-executables/Main-trident.hs index 28be33b2..0727acc7 100644 --- a/src-executables/Main-trident.hs +++ b/src-executables/Main-trident.hs @@ -11,6 +11,8 @@ import Poseidon.CLI.Genoconvert (GenoconvertOptions (.. runGenoconvert) import Poseidon.CLI.Init (InitOptions (..), runInit) +import Poseidon.CLI.Jannocoalesce (JannoCoalesceOptions (..), + runJannocoalesce) import Poseidon.CLI.List (ListOptions (..), runList) import Poseidon.CLI.OptparseApplicativeParsers @@ -68,6 +70,7 @@ data Subcommand = | CmdChronicle ChronicleOptions | CmdTimetravel TimetravelOptions | CmdServe ServeOptions + | CmdJannoCoalesce JannoCoalesceOptions main :: IO () main = do @@ -88,18 +91,20 @@ main = do runCmd :: Subcommand -> PoseidonIO () runCmd o = case o of - CmdInit opts -> runInit opts - CmdList opts -> runList opts - CmdFetch opts -> runFetch opts - CmdForge opts -> runForge opts - CmdGenoconvert opts -> runGenoconvert opts - CmdSummarise opts -> runSummarise opts - CmdSurvey opts -> runSurvey opts - CmdRectify opts -> runRectify opts - CmdValidate opts -> runValidate opts - CmdChronicle opts -> runChronicle opts - CmdTimetravel opts -> runTimetravel opts - CmdServe opts -> runServerMainThread opts + -- alphabetic order + CmdChronicle opts -> runChronicle opts + CmdFetch opts -> runFetch opts + CmdForge opts -> runForge opts + CmdGenoconvert opts -> runGenoconvert opts + CmdJannoCoalesce opts -> runJannocoalesce opts + CmdInit opts -> runInit opts + CmdList opts -> runList opts + CmdRectify opts -> runRectify opts + CmdServe opts -> runServerMainThread opts + CmdSummarise opts -> runSummarise opts + CmdSurvey opts -> runSurvey opts + CmdTimetravel opts -> runTimetravel opts + CmdValidate opts -> runValidate opts optParserInfo :: OP.ParserInfo Options optParserInfo = OP.info ( @@ -131,6 +136,7 @@ subcommandParser = OP.subparser ( OP.command "fetch" fetchOptInfo <> OP.command "forge" forgeOptInfo <> OP.command "genoconvert" genoconvertOptInfo <> + OP.command "jannocoalesce" jannocoalesceOptInfo <> OP.command "rectify" rectifyOptInfo <> OP.commandGroup "Package creation and manipulation commands:" ) <|> @@ -182,6 +188,8 @@ subcommandParser = OP.subparser ( (OP.progDesc "Construct package directories from chronicle files") serveOptInfo = OP.info (OP.helper <*> (CmdServe <$> serveOptParser)) (OP.progDesc "Serve Poseidon packages via HTTP or HTTPS") + jannocoalesceOptInfo = OP.info (OP.helper <*> (CmdJannoCoalesce <$> jannocoalesceOptParser)) + (OP.progDesc "Coalesce information from one or multiple janno files to another one") initOptParser :: OP.Parser InitOptions initOptParser = InitOptions <$> parseInGenotypeDataset @@ -260,3 +268,10 @@ serveOptParser = ServeOptions <$> parseArchiveBasePaths <*> parsePort <*> parseIgnoreChecksums <*> parseMaybeCertFiles + +jannocoalesceOptParser :: OP.Parser JannoCoalesceOptions +jannocoalesceOptParser = JannoCoalesceOptions <$> parseJannocoalSourceSpec + <*> parseJannocoalTargetFile + <*> parseJannocoalOutSpec + <*> parseJannocoalFillColumns + <*> parseJannocoalOverride diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 16a36bbb..39353ee6 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -1,18 +1,26 @@ {-# LANGUAGE OverloadedStrings #-} module Poseidon.CLI.Jannocoalesce where -import Poseidon.Janno (JannoRow(..), JannoRows(..), writeJannoFile, readJannoFile) -import Poseidon.Utils (PoseidonIO, PoseidonException(..)) +import Poseidon.Janno (JannoRow (..), JannoRows (..), + readJannoFile, writeJannoFile) +import Poseidon.Package (PackageReadOptions (..), + defaultPackageReadOptions, + getJointJanno, + readPoseidonPackageCollection) +import Poseidon.Utils (PoseidonException (..), PoseidonIO) -import Control.Monad (forM) -import Control.Monad.Catch (throwM, MonadThrow) -import Control.Monad.IO.Class (liftIO) -import qualified Data.ByteString.Char8 as BSC -import qualified Data.Csv as Csv -import qualified Data.HashMap.Strict as HM +import Control.Monad (forM) +import Control.Monad.Catch (MonadThrow, throwM) +import Control.Monad.IO.Class (liftIO) +import qualified Data.ByteString.Char8 as BSC +import qualified Data.Csv as Csv +import qualified Data.HashMap.Strict as HM + +-- the source can be a single janno file, or a set of base directories as usual. +data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] data JannoCoalesceOptions = JannoCoalesceOptions - { _jannocoalesceSource :: FilePath + { _jannocoalesceSource :: JannoSourceSpec , _jannocoalesceTarget :: FilePath , _jannocoalesceOutSpec :: Maybe FilePath -- Nothing means "in place" , _jannocoalesceFillColumns :: [String] -- empty list means All @@ -20,8 +28,17 @@ data JannoCoalesceOptions = JannoCoalesceOptions } runJannocoalesce :: JannoCoalesceOptions -> PoseidonIO () -runJannocoalesce (JannoCoalesceOptions source target outSpec fields overwrite) = do - JannoRows sourceRows <- readJannoFile source +runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrite) = do + JannoRows sourceRows <- case sourceSpec of + JannoSourceSingle sourceFile -> readJannoFile sourceFile + JannoSourceBaseDirs sourceDirs -> do + let pacReadOpts = defaultPackageReadOptions { + _readOptIgnoreChecksums = True + , _readOptGenoCheck = False + , _readOptIgnoreGeno = True + , _readOptOnlyLatest = True + } + getJointJanno <$> readPoseidonPackageCollection pacReadOpts sourceDirs JannoRows targetRows <- readJannoFile target newJanno <- forM targetRows $ \targetRow -> do let posId = jPoseidonID targetRow @@ -30,7 +47,7 @@ runJannocoalesce (JannoCoalesceOptions source target outSpec fields overwrite) = [] -> return targetRow [keyRow] -> mergeRow targetRow keyRow fields overwrite _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId - let outPath = maybe target id outSpec + let outPath = maybe target id outSpec liftIO $ writeJannoFile outPath (JannoRows newJanno) @@ -43,8 +60,9 @@ mergeRow targetRow sourceRow fields overwrite = do Left err -> throwM $ PoseidonGenericException $ "Janno row-merge error: " ++ err Right r -> return r where + mergeIfMissing :: BSC.ByteString -> BSC.ByteString -> BSC.ByteString -> BSC.ByteString mergeIfMissing key targetVal sourceVal = - if (null key || (BSC.unpack key `elem` fields)) && (targetVal `elem` ["n/a", ""] || overwrite) then + if (null fields || (BSC.unpack key `elem` fields)) && (targetVal `elem` ["n/a", ""] || overwrite) then sourceVal else targetVal diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index 50176dc8..df1e5823 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -2,32 +2,37 @@ module Poseidon.CLI.OptparseApplicativeParsers where -import Poseidon.CLI.Chronicle (ChronOperation (..)) -import Poseidon.CLI.List (ListEntity (..), RepoLocationSpec (..)) -import Poseidon.CLI.Rectify (ChecksumsToRectify (..), - PackageVersionUpdate (..)) -import Poseidon.CLI.Validate (ValidatePlan (..)) -import Poseidon.Contributor (ContributorSpec (..), - contributorSpecParser) -import Poseidon.EntityTypes (EntitiesList, EntityInput (..), - PoseidonEntity, SignedEntitiesList, - SignedEntity, readEntitiesFromString) -import Poseidon.GenotypeData (GenoDataSource (..), - GenotypeDataSpec (..), - GenotypeFormatSpec (..), - SNPSetSpec (..)) -import Poseidon.ServerClient (ArchiveEndpoint (..)) -import Poseidon.Utils (LogMode (..), TestMode (..)) -import Poseidon.Version (VersionComponent (..), parseVersion) - -import Control.Applicative ((<|>)) -import Data.List.Split (splitOn) -import Data.Version (Version) -import qualified Options.Applicative as OP -import SequenceFormats.Plink (PlinkPopNameMode (PlinkPopNameAsBoth, PlinkPopNameAsFamily, PlinkPopNameAsPhenotype)) -import System.FilePath (dropExtension, takeExtension, (<.>)) -import qualified Text.Parsec as P -import Text.Read (readMaybe) +import Poseidon.CLI.Chronicle (ChronOperation (..)) +import Poseidon.CLI.Jannocoalesce (JannoSourceSpec (..)) +import Poseidon.CLI.List (ListEntity (..), + RepoLocationSpec (..)) +import Poseidon.CLI.Rectify (ChecksumsToRectify (..), + PackageVersionUpdate (..)) +import Poseidon.CLI.Validate (ValidatePlan (..)) +import Poseidon.Contributor (ContributorSpec (..), + contributorSpecParser) +import Poseidon.EntityTypes (EntitiesList, EntityInput (..), + PoseidonEntity, SignedEntitiesList, + SignedEntity, + readEntitiesFromString) +import Poseidon.GenotypeData (GenoDataSource (..), + GenotypeDataSpec (..), + GenotypeFormatSpec (..), + SNPSetSpec (..)) +import Poseidon.ServerClient (ArchiveEndpoint (..)) +import Poseidon.Utils (LogMode (..), TestMode (..)) +import Poseidon.Version (VersionComponent (..), + parseVersion) + +import Control.Applicative ((<|>)) +import Data.List.Split (splitOn) +import Data.Version (Version) +import qualified Options.Applicative as OP +import SequenceFormats.Plink (PlinkPopNameMode (PlinkPopNameAsBoth, PlinkPopNameAsFamily, PlinkPopNameAsPhenotype)) +import System.FilePath (dropExtension, takeExtension, + (<.>)) +import qualified Text.Parsec as P +import Text.Read (readMaybe) parseChronOperation :: OP.Parser ChronOperation @@ -762,3 +767,43 @@ parseMaybeArchiveName = OP.option (Just <$> OP.str) ( OP.value Nothing <> OP.showDefault ) + +parseJannocoalSourceSpec :: OP.Parser JannoSourceSpec +parseJannocoalSourceSpec = parseJannocoalSingleSource <|> (JannoSourceBaseDirs <$> parseBasePaths) + where + parseJannocoalSingleSource = OP.option (JannoSourceSingle <$> OP.str) ( + OP.long "sourceFile" <> + OP.metavar "FILE" <> + OP.help "The source Janno file" + ) + +parseJannocoalTargetFile :: OP.Parser FilePath +parseJannocoalTargetFile = OP.strOption ( + OP.long "targetFile" <> + OP.metavar "FILE" <> + OP.help "The target file to fill" + ) + +parseJannocoalOutSpec :: OP.Parser (Maybe FilePath) +parseJannocoalOutSpec = OP.option (Just <$> OP.str) ( + OP.long "outFile" <> + OP.metavar "FILE" <> + OP.value Nothing <> + OP.showDefault <> + OP.help "An optional file to write the results to. If not specified, change the target file in place." + ) + +parseJannocoalFillColumns :: OP.Parser [String] +parseJannocoalFillColumns = OP.option (splitOn "," <$> OP.str) ( + OP.long "fillColumns" <> + OP.short 'f' <> + OP.value [] <> + OP.help "A comma-separated list of Janno field names. If not specified, fill all columns that can be found in the source and target." + ) + +parseJannocoalOverride :: OP.Parser Bool +parseJannocoalOverride = OP.switch ( + OP.long "force" <> + OP.short 'f' <> + OP.help "With this option, potential non-missing content in target columns gets overridden with non-missing content in source columns. By default, only missing data gets filled-in." + ) From ade046ada5bf3b83f303e510a34f6ba5738ed19a Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Mon, 4 Dec 2023 14:56:29 +0100 Subject: [PATCH 06/34] added unit tests --- poseidon-hs.cabal | 2 +- src/Poseidon/Janno.hs | 2 + test/Poseidon/JannocoalesceSpec.hs | 62 +++++++++++++++++++ .../testJannoFiles/normal_subset.janno | 4 ++ 4 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 test/Poseidon/JannocoalesceSpec.hs create mode 100644 test/testDat/testJannoFiles/normal_subset.janno diff --git a/poseidon-hs.cabal b/poseidon-hs.cabal index 64561a57..b1cde946 100644 --- a/poseidon-hs.cabal +++ b/poseidon-hs.cabal @@ -53,7 +53,7 @@ Test-Suite poseidon-tools-tests filepath, pipes, pipes-safe, pipes-ordered-zip, unordered-containers, cassava, containers, process other-modules: Poseidon.PackageSpec, Poseidon.JannoSpec, - Poseidon.BibFileSpec, Poseidon.MathHelpersSpec, + Poseidon.BibFileSpec, Poseidon.MathHelpersSpec, Poseidon.JannocoalesceSpec, Poseidon.SummariseSpec, Poseidon.SurveySpec, Poseidon.GenotypeDataSpec, Poseidon.EntitiesListSpec, PoseidonGoldenTests.GoldenTestsValidateChecksumsSpec, PoseidonGoldenTests.GoldenTestsRunCommands, Poseidon.ChronicleSpec, diff --git a/src/Poseidon/Janno.hs b/src/Poseidon/Janno.hs index 352a651f..ddfc37b3 100644 --- a/src/Poseidon/Janno.hs +++ b/src/Poseidon/Janno.hs @@ -27,9 +27,11 @@ module Poseidon.Janno ( JannoLibraryBuilt (..), AccessionID (..), makeAccessionID, + makeLatitude, makeLongitude, writeJannoFile, readJannoFile, createMinimalJanno, + createMinimalSample, jannoHeaderString, CsvNamedRecord (..), JannoRows (..), diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs new file mode 100644 index 00000000..980b8145 --- /dev/null +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -0,0 +1,62 @@ +{-# LANGUAGE OverloadedStrings #-} +module Poseidon.JannocoalesceSpec (spec) where + +import Poseidon.CLI.Jannocoalesce (mergeRow) +import Poseidon.Janno (JannoList (..), JannoRow (..), + createMinimalSample, + makeLatitude, makeLongitude) + +import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), Sex (..)) +import Test.Hspec + +jannoTarget :: JannoRow +jannoTarget = + let row = createMinimalSample (EigenstratIndEntry "Name" Male "SamplePop") + in row { + jCountry = Just "Austria", + jSite = Just "Vienna", + jDateNote = Just "dating didn't work" + } + +jannoSource :: JannoRow +jannoSource = + let row = createMinimalSample (EigenstratIndEntry "Name" Male "SamplePop2") + in row { + jCountry = Just "Austria", + jSite = Just "Salzburg", + jLatitude = makeLatitude 30.0, + jLongitude = makeLongitude 30.0 + } + +spec :: Spec +spec = do + testMergeRow + +testMergeRow :: Spec +testMergeRow = + describe "Poseidon.Jannocoalesce.mergeRow" $ do + it "should correctly merge without fields and no override" $ do + merged <- mergeRow jannoTarget jannoSource [] False + jSite merged `shouldBe` Just "Vienna" + jGroupName merged `shouldBe` JannoList ["SamplePop"] + jLatitude merged `shouldBe` makeLatitude 30.0 + jLongitude merged `shouldBe` makeLongitude 30.0 + it "should correctly merge without fields and override" $ do + merged <- mergeRow jannoTarget jannoSource [] True + jSite merged `shouldBe` Just "Salzburg" + jGroupName merged `shouldBe` JannoList ["SamplePop2"] + jLatitude merged `shouldBe` makeLatitude 30.0 + jLongitude merged `shouldBe` makeLongitude 30.0 + it "should correctly merge with fields and no override" $ do + merged <- mergeRow jannoTarget jannoSource ["Group_Name", "Latitude"] False + jSite merged `shouldBe` Just "Vienna" + jGroupName merged `shouldBe` JannoList ["SamplePop"] + jLatitude merged `shouldBe` makeLatitude 30.0 + jLongitude merged `shouldBe` Nothing + it "should correctly merge with fields and override" $ do + merged <- mergeRow jannoTarget jannoSource ["Group_Name", "Latitude"] True + jSite merged `shouldBe` Just "Vienna" + jGroupName merged `shouldBe` JannoList ["SamplePop2"] + jLatitude merged `shouldBe` makeLatitude 30.0 + jLongitude merged `shouldBe` Nothing + diff --git a/test/testDat/testJannoFiles/normal_subset.janno b/test/testDat/testJannoFiles/normal_subset.janno new file mode 100644 index 00000000..ea7a635d --- /dev/null +++ b/test/testDat/testJannoFiles/normal_subset.janno @@ -0,0 +1,4 @@ +Poseidon_ID Genetic_Sex Group_Name Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AdditionalColumn2 AdditionalColumn1 +XXX011 M POP1;POP3 Aachen xxx 0 0 C14 A-1;A-2;A-3 3000;3100;2900 30;40;20 -1200 -1000 -800 x x x A C xxx;yyy 2 Lib1;Lib2 Shotgun;1240K minus ds diploid ftp://test.test 0 0 0 0 10 1 ANGSD v.123 Ich;mag;Kekse Ich unpublished This is a fine sample Hutschnur test2 test1 +XXX012 F POP2 xxx Cologne -90 -180 contextual n/a n/a n/a -5500 -5000 -4500 yyy B B xxx 0 Lib3 1240K half ss haploid https://www.google.de 0 0 0 100 20;50;70 2;5;7.4 Schmutzi v145;Zwiebel;other xxx aus Du PaulNature2026 Cheesecake n/a test4 test3 +XXX013 M POP1 xxx xxx 90 180 modern n/a n/a n/a 2000 2000 2000 n/a C A xxx 0 ReferenceGenome plus mixed diploid http://huhu.org/23&test 0 0 0 50 n/a n/a n/a n/a der Dose Müllers Kuh BovineCell1618 n/a A;B;C test6 test5 From fc567f6924809cd0687d2983f0c702e3eb43f48b Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Tue, 5 Dec 2023 09:52:18 +0100 Subject: [PATCH 07/34] added golden test for jannocoalesce --- src/Poseidon/CLI/Jannocoalesce.hs | 10 +- .../GoldenTestCheckSumFile.txt | 3 +- .../GoldenTestData/chronicle/chronicle2.yml | 14 +- .../targetNoFieldsNoOverride.janno | 4 + .../GoldenTestsRunCommands.hs | 136 +++++++++++------- 5 files changed, 104 insertions(+), 63 deletions(-) create mode 100644 test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/targetNoFieldsNoOverride.janno diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 39353ee6..b0cbc010 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -7,7 +7,7 @@ import Poseidon.Package (PackageReadOptions (..), defaultPackageReadOptions, getJointJanno, readPoseidonPackageCollection) -import Poseidon.Utils (PoseidonException (..), PoseidonIO) +import Poseidon.Utils (PoseidonException (..), PoseidonIO, logInfo) import Control.Monad (forM) import Control.Monad.Catch (MonadThrow, throwM) @@ -15,6 +15,8 @@ import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BSC import qualified Data.Csv as Csv import qualified Data.HashMap.Strict as HM +import System.Directory (createDirectoryIfMissing) +import System.FilePath (takeDirectory) -- the source can be a single janno file, or a set of base directories as usual. data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] @@ -48,8 +50,10 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit [keyRow] -> mergeRow targetRow keyRow fields overwrite _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId let outPath = maybe target id outSpec - liftIO $ writeJannoFile outPath (JannoRows newJanno) - + logInfo $ "Writing to file (directory will be created if missing): " ++ outPath + liftIO $ do + createDirectoryIfMissing True (takeDirectory outPath) + writeJannoFile outPath (JannoRows newJanno) mergeRow :: (MonadThrow m) => JannoRow -> JannoRow -> [String] -> Bool -> m JannoRow mergeRow targetRow sourceRow fields overwrite = do diff --git a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt index 6c6899b4..761322d1 100644 --- a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt +++ b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt @@ -112,4 +112,5 @@ b43da4d5734371c0648553120f812466 fetch fetch/multi_packages_2/Lamnidis_2018-1.0. 1d2a588b88e6d1017147c01f19d0b878 listRemote listRemote/listRemote1 0ddad9ea097bca0253e0c3c6157efa68 listRemote listRemote/listRemote2 b2286cf9af7c6c8757b8109a1f58e2d9 listRemote listRemote/listRemote3 -0433b2a80ee5a2eb5bf8c6404130e562 listRemote listRemote/listRemote4 \ No newline at end of file +0433b2a80ee5a2eb5bf8c6404130e562 listRemote listRemote/listRemote4 +4361cf41737cb13cc50cd21936c64030 jannocoalesce jannocoalesce/targetNoFieldsNoOverride.janno \ No newline at end of file diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml index 99865099..f9a6e3de 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml +++ b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml @@ -1,29 +1,29 @@ title: Chronicle title description: Chronicle description chronicleVersion: 0.2.0 -lastModified: 2023-09-22 +lastModified: 2023-12-05 packages: - title: Lamnidis_2018 version: 1.0.0 - commit: eb2e7c2af61b6738f0ad8862645c23dd57bf0bd1 + commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e path: Lamnidis_2018 - title: Lamnidis_2018 version: 1.0.1 - commit: eb2e7c2af61b6738f0ad8862645c23dd57bf0bd1 + commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e path: Lamnidis_2018_newVersion - title: Schiffels version: 1.1.1 - commit: fa2e92af97376489b32ce8b6874428c958d55f3f + commit: a573f677fea11b12a1e9d00256c208a18dd09d51 path: Schiffels - title: Schiffels_2016 version: 1.0.1 - commit: eb2e7c2af61b6738f0ad8862645c23dd57bf0bd1 + commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e path: Schiffels_2016 - title: Schmid_2028 version: 1.0.0 - commit: eb2e7c2af61b6738f0ad8862645c23dd57bf0bd1 + commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e path: Schmid_2028 - title: Wang_2020 version: 0.1.0 - commit: eb2e7c2af61b6738f0ad8862645c23dd57bf0bd1 + commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e path: Wang_2020 diff --git a/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/targetNoFieldsNoOverride.janno b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/targetNoFieldsNoOverride.janno new file mode 100644 index 00000000..5fb72c09 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/targetNoFieldsNoOverride.janno @@ -0,0 +1,4 @@ +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AdditionalColumn1 AdditionalColumn2 +XXX011 M POP1;POP3 Paul;Peter XXX012;I1234 first;second father_of;grandfather_of yyy n/a xxx DE Aachen xxx 0.0 0.0 C14 A-1;A-2;A-3 3000;3100;2900 30;40;20 -1200 -1000 -800 x x x A C xxx;yyy 2 Lib1;Lib2 Shotgun;1240K minus ds diploid ftp://test.test 0.0 0 0.0 0.0 10 1 ANGSD v.123 n/a Ich;mag;Kekse Ich unpublished This is a fine sample Hutschnur test1 test2 +XXX012 F POP2 n/a XXX011 first daughter_of n/a n/a xxx FR xxx Cologne -90.0 -180.0 contextual n/a n/a n/a -5500 -5000 -4500 yyy B B xxx 0 Lib3 1240K half ss haploid https://www.google.de 0.0 0 0.0 100.0 20;50;70 2;5;7.4 Schmutzi v145;Zwiebel;other xxx aus Du PaulNature2026 Cheesecake n/a test3 test4 +XXX013 M POP1 Skeleton Joe XXX011 sixthToTenth n/a xxx n/a xxx EG xxx xxx 90.0 180.0 modern n/a n/a n/a 2000 2000 2000 n/a C A xxx 0 n/a ReferenceGenome plus mixed diploid http://huhu.org/23&test 0.0 0 0.0 50.0 n/a n/a n/a n/a der Dose Müllers Kuh BovineCell1618 n/a A;B;C test5 test6 diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index 9613413f..c95eb0aa 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -4,59 +4,63 @@ module PoseidonGoldenTests.GoldenTestsRunCommands ( createStaticCheckSumFile, createDynamicCheckSumFile, staticCheckSumFile, dynamicCheckSumFile ) where -import Poseidon.CLI.Fetch (FetchOptions (..), runFetch) -import Poseidon.CLI.Forge (ForgeOptions (..), runForge) -import Poseidon.CLI.Genoconvert (GenoconvertOptions (..), - runGenoconvert) -import Poseidon.CLI.Init (InitOptions (..), runInit) -import Poseidon.CLI.List (ListEntity (..), ListOptions (..), - RepoLocationSpec (..), runList) -import Poseidon.CLI.Rectify (ChecksumsToRectify (..), - PackageVersionUpdate (..), - RectifyOptions (..), runRectify) -import Poseidon.CLI.Serve (ServeOptions (..), runServer) -import Poseidon.CLI.Summarise (SummariseOptions (..), runSummarise) -import Poseidon.CLI.Survey (SurveyOptions (..), runSurvey) -import Poseidon.CLI.Timetravel (TimetravelOptions (..), - runTimetravel) -import Poseidon.CLI.Validate (ValidateOptions (..), - ValidatePlan (..), runValidate) -import Poseidon.Contributor (ContributorSpec (..)) -import Poseidon.EntityTypes (EntityInput (..), - PoseidonEntity (..), - readEntitiesFromString) -import Poseidon.GenotypeData (GenoDataSource (..), - GenotypeDataSpec (..), - GenotypeFormatSpec (..), - SNPSetSpec (..)) -import Poseidon.ServerClient (ArchiveEndpoint (..)) -import Poseidon.Utils (LogMode (..), TestMode (..), - getChecksum, testLog, - usePoseidonLogger) -import Poseidon.Version (VersionComponent (..)) +import Poseidon.CLI.Fetch (FetchOptions (..), runFetch) +import Poseidon.CLI.Forge (ForgeOptions (..), runForge) +import Poseidon.CLI.Genoconvert (GenoconvertOptions (..), + runGenoconvert) +import Poseidon.CLI.Init (InitOptions (..), runInit) +import Poseidon.CLI.Jannocoalesce (JannoCoalesceOptions (..), + runJannocoalesce, JannoSourceSpec(..)) +import Poseidon.CLI.List (ListEntity (..), ListOptions (..), + RepoLocationSpec (..), runList) +import Poseidon.CLI.Rectify (ChecksumsToRectify (..), + PackageVersionUpdate (..), + RectifyOptions (..), runRectify) +import Poseidon.CLI.Serve (ServeOptions (..), runServer) +import Poseidon.CLI.Summarise (SummariseOptions (..), + runSummarise) +import Poseidon.CLI.Survey (SurveyOptions (..), runSurvey) +import Poseidon.CLI.Timetravel (TimetravelOptions (..), + runTimetravel) +import Poseidon.CLI.Validate (ValidateOptions (..), + ValidatePlan (..), runValidate) +import Poseidon.Contributor (ContributorSpec (..)) +import Poseidon.EntityTypes (EntityInput (..), + PoseidonEntity (..), + readEntitiesFromString) +import Poseidon.GenotypeData (GenoDataSource (..), + GenotypeDataSpec (..), + GenotypeFormatSpec (..), + SNPSetSpec (..)) +import Poseidon.ServerClient (ArchiveEndpoint (..)) +import Poseidon.Utils (LogMode (..), TestMode (..), + getChecksum, testLog, + usePoseidonLogger) +import Poseidon.Version (VersionComponent (..)) -import Control.Concurrent (forkIO, killThread, newEmptyMVar) -import Control.Concurrent.MVar (takeMVar) -import Control.Exception (finally) -import Control.Monad (forM_, unless, when) -import Data.Either (fromRight) -import Data.Function ((&)) -import qualified Data.Text as T -import qualified Data.Text.IO as T -import Data.Version (makeVersion) -import GHC.IO.Handle (hClose, hDuplicate, hDuplicateTo) -import Poseidon.CLI.Chronicle (ChronOperation (..), - ChronicleOptions (..), runChronicle) -import Poseidon.EntityTypes (PacNameAndVersion (..)) -import SequenceFormats.Plink (PlinkPopNameMode (..)) -import System.Directory (copyFile, createDirectory, - createDirectoryIfMissing, - doesDirectoryExist, listDirectory, - removeDirectoryRecursive) -import System.FilePath.Posix (()) -import System.IO (IOMode (WriteMode), hPutStrLn, - openFile, stderr, stdout, withFile) -import System.Process (callCommand) +import Control.Concurrent (forkIO, killThread, newEmptyMVar) +import Control.Concurrent.MVar (takeMVar) +import Control.Exception (finally) +import Control.Monad (forM_, unless, when) +import Data.Either (fromRight) +import Data.Function ((&)) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import Data.Version (makeVersion) +import GHC.IO.Handle (hClose, hDuplicate, hDuplicateTo) +import Poseidon.CLI.Chronicle (ChronOperation (..), + ChronicleOptions (..), + runChronicle) +import Poseidon.EntityTypes (PacNameAndVersion (..)) +import SequenceFormats.Plink (PlinkPopNameMode (..)) +import System.Directory (copyFile, createDirectory, + createDirectoryIfMissing, + doesDirectoryExist, listDirectory, + removeDirectoryRecursive) +import System.FilePath.Posix (()) +import System.IO (IOMode (WriteMode), hPutStrLn, + openFile, stderr, stdout, withFile) +import System.Process (callCommand) -- file paths -- @@ -182,6 +186,8 @@ runCLICommands interactive testDir checkFilePath = do testPipelineFetch testDir checkFilePath hPutStrLn stderr "--- list --remote" testPipelineListRemote testDir checkFilePath + hPutStrLn stderr "--- jannocoalesce" + testPipelineJannocoalesce testDir checkFilePath -- close error sink hClose devNull unless interactive $ hDuplicateTo stderr_old stderr @@ -1076,3 +1082,29 @@ testPipelineListRemote testDir checkFilePath = do ) ( killThread threadID ) + +testPipelineJannocoalesce :: FilePath -> FilePath -> IO () +testPipelineJannocoalesce testDir checkFilePath = do + let jannocoalesceOpts1 = JannoCoalesceOptions { + _jannocoalesceSource = JannoSourceSingle "test/testDat/testJannoFiles/normal_full.janno", + _jannocoalesceTarget = "test/testDat/testJannoFiles/normal_subset.janno", + _jannocoalesceOutSpec = Just (testDir "jannocoalesce" "targetNoFieldsNoOverride.janno"), + _jannocoalesceFillColumns = [], + _jannocoalesceOverwriteColumns = False + } + runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts1) "jannocoalesce" [ + "jannocoalesce" "targetNoFieldsNoOverride.janno" + ] + -- runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts2) "jannocoalesce" [ + -- "jannocoalesce" "targetFieldsNoOverride.janno", + -- ] + -- runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts3) "jannocoalesce" [ + -- "jannocoalesce" "targetNoFieldsOverride.janno", + -- ] + -- runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts4) "jannocoalesce" [ + -- "jannocoalesce" "targetFieldsOverride.janno", + -- ] + -- runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts5) "jannocoalesce" [ + -- "jannocoalesce" "targetFromBaseDir.janno", + -- ] + From 0d019f049f23f49a71d622fb130c0619d3a3b787 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Tue, 5 Dec 2023 09:53:39 +0100 Subject: [PATCH 08/34] bumped version, added Changelog --- CHANGELOG.md | 2 ++ poseidon-hs.cabal | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf95e9a1..ab9756ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,5 @@ +- V 1.4.1.0: + - Added new tool `trident jannocoalesce`, which merges information from a source janno file to a target janno file, see `trident jannocoalesce --help` for details. - V 1.4.0.3: - Fixed a severe performance leak in code around `resolveEntityIndices`, which was called in various functions and wastefully recomputed `isLatestInCollection` way too often. This affected simple commands, like fetching a few packages from the server, forging, and has effects also in xerxes. - Bumped to a newer Compiler (GHC 9.4.7) and new Stackage Snapshot (LTS-21.17) diff --git a/poseidon-hs.cabal b/poseidon-hs.cabal index b1cde946..cab929fd 100644 --- a/poseidon-hs.cabal +++ b/poseidon-hs.cabal @@ -1,5 +1,5 @@ name: poseidon-hs -version: 1.4.0.3 +version: 1.4.1.0 synopsis: A package with tools for working with Poseidon Genotype Data description: The tools in this package read and analyse Poseidon-formatted genotype databases, a modular system for storing genotype data from thousands of individuals. license: MIT From 6ee7de78ccd9651da746552561263697ec98d3f0 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Wed, 6 Dec 2023 14:33:58 +0100 Subject: [PATCH 09/34] stylish-haskell --- src/Poseidon/CLI/Jannocoalesce.hs | 3 ++- test/Poseidon/JannocoalesceSpec.hs | 4 ++-- test/PoseidonGoldenTests/GoldenTestsRunCommands.hs | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index b0cbc010..9fcfb7b0 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -7,7 +7,8 @@ import Poseidon.Package (PackageReadOptions (..), defaultPackageReadOptions, getJointJanno, readPoseidonPackageCollection) -import Poseidon.Utils (PoseidonException (..), PoseidonIO, logInfo) +import Poseidon.Utils (PoseidonException (..), PoseidonIO, + logInfo) import Control.Monad (forM) import Control.Monad.Catch (MonadThrow, throwM) diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index 980b8145..7745e1e7 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -3,8 +3,8 @@ module Poseidon.JannocoalesceSpec (spec) where import Poseidon.CLI.Jannocoalesce (mergeRow) import Poseidon.Janno (JannoList (..), JannoRow (..), - createMinimalSample, - makeLatitude, makeLongitude) + createMinimalSample, makeLatitude, + makeLongitude) import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), Sex (..)) import Test.Hspec diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index c95eb0aa..b10657e4 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -10,7 +10,8 @@ import Poseidon.CLI.Genoconvert (GenoconvertOptions (..), runGenoconvert) import Poseidon.CLI.Init (InitOptions (..), runInit) import Poseidon.CLI.Jannocoalesce (JannoCoalesceOptions (..), - runJannocoalesce, JannoSourceSpec(..)) + JannoSourceSpec (..), + runJannocoalesce) import Poseidon.CLI.List (ListEntity (..), ListOptions (..), RepoLocationSpec (..), runList) import Poseidon.CLI.Rectify (ChecksumsToRectify (..), From 7a8c3316cfb05b287781711613e789a10940603e Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Mon, 18 Dec 2023 16:07:34 +0100 Subject: [PATCH 10/34] continued --- src-executables/Main-trident.hs | 3 ++ src/Poseidon/CLI/Jannocoalesce.hs | 33 ++++++++++++++++--- .../CLI/OptparseApplicativeParsers.hs | 11 +++++++ .../GoldenTestsRunCommands.hs | 5 ++- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/src-executables/Main-trident.hs b/src-executables/Main-trident.hs index 0727acc7..81c33f47 100644 --- a/src-executables/Main-trident.hs +++ b/src-executables/Main-trident.hs @@ -275,3 +275,6 @@ jannocoalesceOptParser = JannoCoalesceOptions <$> parseJannocoalSourceSpec <*> parseJannocoalOutSpec <*> parseJannocoalFillColumns <*> parseJannocoalOverride + <*> parseJannocoalSourceKey + <*> parseJannocoalTargetKey + <*> parseJannocoalIdStripRegex diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 9fcfb7b0..55a454ad 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -10,14 +10,16 @@ import Poseidon.Package (PackageReadOptions (..), import Poseidon.Utils (PoseidonException (..), PoseidonIO, logInfo) -import Control.Monad (forM) +import Control.Monad (filterM, forM) import Control.Monad.Catch (MonadThrow, throwM) import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BSC import qualified Data.Csv as Csv import qualified Data.HashMap.Strict as HM +import Data.List.Utils (replace) import System.Directory (createDirectoryIfMissing) import System.FilePath (takeDirectory) +import Text.Regex.TDFA (subRegex) -- the source can be a single janno file, or a set of base directories as usual. data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] @@ -28,10 +30,13 @@ data JannoCoalesceOptions = JannoCoalesceOptions , _jannocoalesceOutSpec :: Maybe FilePath -- Nothing means "in place" , _jannocoalesceFillColumns :: [String] -- empty list means All , _jannocoalesceOverwriteColumns :: Bool + , _jannocoalesceSourceKey :: String -- by default set to "Poseidon_ID" + , _jannocoalesceTargetKey :: String -- by default set to "Poseidon_ID" + , _jannocoalesceIdStrip :: Maybe String -- an optional regex to strip from target and source keys } runJannocoalesce :: JannoCoalesceOptions -> PoseidonIO () -runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrite) = do +runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrite sKey tKey maybeStrip) = do JannoRows sourceRows <- case sourceSpec of JannoSourceSingle sourceFile -> readJannoFile sourceFile JannoSourceBaseDirs sourceDirs -> do @@ -44,8 +49,8 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit getJointJanno <$> readPoseidonPackageCollection pacReadOpts sourceDirs JannoRows targetRows <- readJannoFile target newJanno <- forM targetRows $ \targetRow -> do - let posId = jPoseidonID targetRow - sourceRowCandidates = filter (\r -> jPoseidonID r == posId) sourceRows + posId <- getKeyFromJanno targetRow tKey + sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows case sourceRowCandidates of [] -> return targetRow [keyRow] -> mergeRow targetRow keyRow fields overwrite @@ -56,6 +61,26 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit createDirectoryIfMissing True (takeDirectory outPath) writeJannoFile outPath (JannoRows newJanno) +getKeyFromJanno :: (MonadThrow m) => JannoRow -> String -> m String +getKeyFromJanno jannoRow key = do + let jannoRowDict = Csv.toNamedRecord jannoRow + case jannoRowDict HM.!? (BSC.pack key) of + Nothing -> throwM $ PoseidonGenericException ("Key " ++ key ++ " not present in janno file") + Just r -> return $ BSC.unpack r + +matchWithOptionalStrip :: (Maybe String) -> String -> String -> Bool +matchWithOptionalStrip maybeRegex id1 id2 = + case maybeRegex of + Nothing -> id1 == id2 + Just r -> + let id1stripped = stripR (mkRegex r) id1 "" + id2stripped = stripR (mkRegex r) id2 "" + in id1stripped == id2stripped + where + stripR r s = + let match = s =~ r + in replace s match "" + mergeRow :: (MonadThrow m) => JannoRow -> JannoRow -> [String] -> Bool -> m JannoRow mergeRow targetRow sourceRow fields overwrite = do let targetRowRecord = Csv.toNamedRecord targetRow diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index df1e5823..947621ec 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -807,3 +807,14 @@ parseJannocoalOverride = OP.switch ( OP.short 'f' <> OP.help "With this option, potential non-missing content in target columns gets overridden with non-missing content in source columns. By default, only missing data gets filled-in." ) + +parseJannocoalSourceKey :: OP.Parser String +parseJannocoalSourceKey = OP.strOption (OP.long "sourceKey" <> OP.help "the janno column to use as the source key" <> OP.value "Poseidon_ID" <> OP.showDefault) + +parseJannocoalTargetKey :: OP.Parser String +parseJannocoalTargetKey = OP.strOption (OP.long "targetKey" <> OP.help "the janno column to use as the target key" <> OP.value "Poseidon_ID" <> OP.showDefault) + +parseJannocoalIdStripRegex :: OP.Parser (Maybe String) +parseJannocoalIdStripRegex = OP.option (Just <$> OP.str) (OP.long "stripIdSuffix" <> + OP.help "an optional regular expression to identify parts of the IDs to strip before matching between source and target" <> OP.value Nothing) + diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index b10657e4..62b161be 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -1091,7 +1091,10 @@ testPipelineJannocoalesce testDir checkFilePath = do _jannocoalesceTarget = "test/testDat/testJannoFiles/normal_subset.janno", _jannocoalesceOutSpec = Just (testDir "jannocoalesce" "targetNoFieldsNoOverride.janno"), _jannocoalesceFillColumns = [], - _jannocoalesceOverwriteColumns = False + _jannocoalesceOverwriteColumns = False, + _jannocoalesceSourceKey = "Poseidon_ID", + _jannocoalesceTargetKey = "Poseidon_ID", + _jannocoalesceIdStripRegex = Nothing } runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts1) "jannocoalesce" [ "jannocoalesce" "targetNoFieldsNoOverride.janno" From 21db6a401793b18e19ffbe4c9e9e784f356af0ab Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Tue, 19 Dec 2023 15:51:04 +0100 Subject: [PATCH 11/34] added tests --- src/Poseidon/CLI/Jannocoalesce.hs | 39 ++++++---- test/Poseidon/JannocoalesceSpec.hs | 72 +++++++++++++++---- .../GoldenTestsRunCommands.hs | 15 +--- 3 files changed, 82 insertions(+), 44 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 55a454ad..17cbf137 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -1,4 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} + module Poseidon.CLI.Jannocoalesce where import Poseidon.Janno (JannoRow (..), JannoRows (..), @@ -16,10 +17,10 @@ import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BSC import qualified Data.Csv as Csv import qualified Data.HashMap.Strict as HM -import Data.List.Utils (replace) +import Data.Text (pack, replace, unpack) import System.Directory (createDirectoryIfMissing) import System.FilePath (takeDirectory) -import Text.Regex.TDFA (subRegex) +import Text.Regex.TDFA ((=~)) -- the source can be a single janno file, or a set of base directories as usual. data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] @@ -48,19 +49,25 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit } getJointJanno <$> readPoseidonPackageCollection pacReadOpts sourceDirs JannoRows targetRows <- readJannoFile target - newJanno <- forM targetRows $ \targetRow -> do - posId <- getKeyFromJanno targetRow tKey - sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows - case sourceRowCandidates of - [] -> return targetRow - [keyRow] -> mergeRow targetRow keyRow fields overwrite - _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId + + newJanno <- makeNewJannoRows sourceRows targetRows fields overwrite sKey tKey maybeStrip + let outPath = maybe target id outSpec logInfo $ "Writing to file (directory will be created if missing): " ++ outPath liftIO $ do createDirectoryIfMissing True (takeDirectory outPath) writeJannoFile outPath (JannoRows newJanno) +makeNewJannoRows :: (MonadThrow m) => [JannoRow] -> [JannoRow] -> [String] -> Bool -> String -> String -> Maybe String -> m [JannoRow] +makeNewJannoRows sourceRows targetRows fields overwrite sKey tKey maybeStrip = + forM targetRows $ \targetRow -> do + posId <- getKeyFromJanno targetRow tKey + sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows + case sourceRowCandidates of + [] -> return targetRow + [keyRow] -> mergeRow targetRow keyRow fields overwrite tKey + _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId + getKeyFromJanno :: (MonadThrow m) => JannoRow -> String -> m String getKeyFromJanno jannoRow key = do let jannoRowDict = Csv.toNamedRecord jannoRow @@ -73,16 +80,18 @@ matchWithOptionalStrip maybeRegex id1 id2 = case maybeRegex of Nothing -> id1 == id2 Just r -> - let id1stripped = stripR (mkRegex r) id1 "" - id2stripped = stripR (mkRegex r) id2 "" + let id1stripped = stripR r id1 + id2stripped = stripR r id2 in id1stripped == id2stripped where + stripR :: String -> String -> String stripR r s = let match = s =~ r - in replace s match "" + in if null match then s else unpack $ replace (pack match) "" (pack s) -mergeRow :: (MonadThrow m) => JannoRow -> JannoRow -> [String] -> Bool -> m JannoRow -mergeRow targetRow sourceRow fields overwrite = do +-- note that we never overwrite values in the key-colum, that's why this function takes it as an argument to check. +mergeRow :: (MonadThrow m) => JannoRow -> JannoRow -> [String] -> Bool -> String -> m JannoRow +mergeRow targetRow sourceRow fields overwrite keyCol = do let targetRowRecord = Csv.toNamedRecord targetRow sourceRowRecord = Csv.toNamedRecord sourceRow parseResult = Csv.runParser . Csv.parseNamedRecord $ HM.unionWithKey mergeIfMissing targetRowRecord sourceRowRecord @@ -92,7 +101,7 @@ mergeRow targetRow sourceRow fields overwrite = do where mergeIfMissing :: BSC.ByteString -> BSC.ByteString -> BSC.ByteString -> BSC.ByteString mergeIfMissing key targetVal sourceVal = - if (null fields || (BSC.unpack key `elem` fields)) && (targetVal `elem` ["n/a", ""] || overwrite) then + if key /= BSC.pack keyCol && (null fields || (BSC.unpack key `elem` fields)) && (targetVal `elem` ["n/a", ""] || overwrite) then sourceVal else targetVal diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index 7745e1e7..44691220 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -1,16 +1,23 @@ {-# LANGUAGE OverloadedStrings #-} module Poseidon.JannocoalesceSpec (spec) where -import Poseidon.CLI.Jannocoalesce (mergeRow) -import Poseidon.Janno (JannoList (..), JannoRow (..), +import Poseidon.CLI.Jannocoalesce (makeNewJannoRows, mergeRow) +import Poseidon.Janno (CsvNamedRecord (..), + JannoList (..), JannoRow (..), createMinimalSample, makeLatitude, makeLongitude) +import qualified Data.HashMap.Strict as HM import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), Sex (..)) import Test.Hspec -jannoTarget :: JannoRow -jannoTarget = +spec :: Spec +spec = do + testMergeSingleRow + testCoalesceMultipleRows + +jannoTargetRow :: JannoRow +jannoTargetRow = let row = createMinimalSample (EigenstratIndEntry "Name" Male "SamplePop") in row { jCountry = Just "Austria", @@ -18,8 +25,8 @@ jannoTarget = jDateNote = Just "dating didn't work" } -jannoSource :: JannoRow -jannoSource = +jannoSourceRow :: JannoRow +jannoSourceRow = let row = createMinimalSample (EigenstratIndEntry "Name" Male "SamplePop2") in row { jCountry = Just "Austria", @@ -28,35 +35,70 @@ jannoSource = jLongitude = makeLongitude 30.0 } -spec :: Spec -spec = do - testMergeRow +jannoTargetRows :: [JannoRow] +jannoTargetRows = + let row1 = createMinimalSample (EigenstratIndEntry "Ind1" Male "SamplePop") + row2 = createMinimalSample (EigenstratIndEntry "Ind2" Male "SamplePop") + row3 = createMinimalSample (EigenstratIndEntry "Ind3_AB" Male "SamplePop") + in [row1 {jCountry = Just "Germany"}, row2, row3] + +jannoSourceRows :: [JannoRow] +jannoSourceRows = + let row1 = createMinimalSample (EigenstratIndEntry "Ind1" Male "SamplePop") + row2 = createMinimalSample (EigenstratIndEntry "Ind2" Male "SamplePop") + row3 = createMinimalSample (EigenstratIndEntry "Ind3" Male "SamplePop") + in [ + row1 {jCountry = Just "Austria", jAdditionalColumns = CsvNamedRecord $ HM.fromList [("Poseidon_ID_alt", "Ind1")]}, + row2 {jCountry = Just "Austria", jAdditionalColumns = CsvNamedRecord $ HM.fromList [("Poseidon_ID_alt", "Ind2")]}, + row3 {jCountry = Just "Austria", jAdditionalColumns = CsvNamedRecord $ HM.fromList [("Poseidon_ID_alt", "Ind3_AB")]}] -testMergeRow :: Spec -testMergeRow = +testMergeSingleRow :: Spec +testMergeSingleRow = describe "Poseidon.Jannocoalesce.mergeRow" $ do it "should correctly merge without fields and no override" $ do - merged <- mergeRow jannoTarget jannoSource [] False + merged <- mergeRow jannoTargetRow jannoSourceRow [] False "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 it "should correctly merge without fields and override" $ do - merged <- mergeRow jannoTarget jannoSource [] True + merged <- mergeRow jannoTargetRow jannoSourceRow [] True "Poseidon_ID" jSite merged `shouldBe` Just "Salzburg" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 it "should correctly merge with fields and no override" $ do - merged <- mergeRow jannoTarget jannoSource ["Group_Name", "Latitude"] False + merged <- mergeRow jannoTargetRow jannoSourceRow ["Group_Name", "Latitude"] False "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` Nothing it "should correctly merge with fields and override" $ do - merged <- mergeRow jannoTarget jannoSource ["Group_Name", "Latitude"] True + merged <- mergeRow jannoTargetRow jannoSourceRow ["Group_Name", "Latitude"] True "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` Nothing +testCoalesceMultipleRows :: Spec +testCoalesceMultipleRows = describe "Poseidon.Jannocoalesce.makeNewJannoRows" $ do + it "should correctly copy with simple matching" $ do + newJ <- makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" Nothing + jCountry (newJ !! 0) `shouldBe` Just "Germany" + jCountry (newJ !! 1) `shouldBe` Just "Austria" + jCountry (newJ !! 2) `shouldBe` Nothing + it "should correctly copy with simple matching and overwrite" $ do + newJ <- makeNewJannoRows jannoSourceRows jannoTargetRows [] True "Poseidon_ID" "Poseidon_ID" Nothing + jCountry (newJ !! 0) `shouldBe` Just "Austria" + jCountry (newJ !! 1) `shouldBe` Just "Austria" + it "should throw with duplicate source keys" $ do + let s = jannoSourceRows ++ [jannoSourceRows !! 1] + makeNewJannoRows s jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" Nothing `shouldThrow` anyException + it "should correctly copy with suffix strip" $ do + newJ <- makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" (Just "_AB") + jCountry (newJ !! 0) `shouldBe` Just "Germany" + jCountry (newJ !! 1) `shouldBe` Just "Austria" + jCountry (newJ !! 2) `shouldBe` Just "Austria" + it "should correctly copy with alternative ID column" $ do + newJ <- makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID_alt" "Poseidon_ID" Nothing + jCountry (newJ !! 2) `shouldBe` Just "Austria" diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index 62b161be..3151571d 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -1094,21 +1094,8 @@ testPipelineJannocoalesce testDir checkFilePath = do _jannocoalesceOverwriteColumns = False, _jannocoalesceSourceKey = "Poseidon_ID", _jannocoalesceTargetKey = "Poseidon_ID", - _jannocoalesceIdStripRegex = Nothing + _jannocoalesceIdStrip = Nothing } runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts1) "jannocoalesce" [ "jannocoalesce" "targetNoFieldsNoOverride.janno" ] - -- runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts2) "jannocoalesce" [ - -- "jannocoalesce" "targetFieldsNoOverride.janno", - -- ] - -- runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts3) "jannocoalesce" [ - -- "jannocoalesce" "targetNoFieldsOverride.janno", - -- ] - -- runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts4) "jannocoalesce" [ - -- "jannocoalesce" "targetFieldsOverride.janno", - -- ] - -- runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts5) "jannocoalesce" [ - -- "jannocoalesce" "targetFromBaseDir.janno", - -- ] - From 8f5b769dfd27f9877fee6231fd67679f23562511 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Tue, 19 Dec 2023 15:56:58 +0100 Subject: [PATCH 12/34] changed some option short forms --- src/Poseidon/CLI/OptparseApplicativeParsers.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index 947621ec..1bdd9426 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -773,6 +773,7 @@ parseJannocoalSourceSpec = parseJannocoalSingleSource <|> (JannoSourceBaseDirs < where parseJannocoalSingleSource = OP.option (JannoSourceSingle <$> OP.str) ( OP.long "sourceFile" <> + OP.short 's' <> OP.metavar "FILE" <> OP.help "The source Janno file" ) @@ -780,6 +781,7 @@ parseJannocoalSourceSpec = parseJannocoalSingleSource <|> (JannoSourceBaseDirs < parseJannocoalTargetFile :: OP.Parser FilePath parseJannocoalTargetFile = OP.strOption ( OP.long "targetFile" <> + OP.short 't' <> OP.metavar "FILE" <> OP.help "The target file to fill" ) @@ -787,6 +789,7 @@ parseJannocoalTargetFile = OP.strOption ( parseJannocoalOutSpec :: OP.Parser (Maybe FilePath) parseJannocoalOutSpec = OP.option (Just <$> OP.str) ( OP.long "outFile" <> + OP.short `-o` <> OP.metavar "FILE" <> OP.value Nothing <> OP.showDefault <> @@ -796,7 +799,6 @@ parseJannocoalOutSpec = OP.option (Just <$> OP.str) ( parseJannocoalFillColumns :: OP.Parser [String] parseJannocoalFillColumns = OP.option (splitOn "," <$> OP.str) ( OP.long "fillColumns" <> - OP.short 'f' <> OP.value [] <> OP.help "A comma-separated list of Janno field names. If not specified, fill all columns that can be found in the source and target." ) From d85995731c7a288da0acf8c862322882dfb2e076 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Tue, 19 Dec 2023 16:47:02 +0100 Subject: [PATCH 13/34] added log outputs, not tested yet --- src/Poseidon/CLI/Jannocoalesce.hs | 29 +++++++++++++------ .../CLI/OptparseApplicativeParsers.hs | 2 +- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 17cbf137..7f214325 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -58,14 +58,14 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit createDirectoryIfMissing True (takeDirectory outPath) writeJannoFile outPath (JannoRows newJanno) -makeNewJannoRows :: (MonadThrow m) => [JannoRow] -> [JannoRow] -> [String] -> Bool -> String -> String -> Maybe String -> m [JannoRow] +makeNewJannoRows :: [JannoRow] -> [JannoRow] -> [String] -> Bool -> String -> String -> Maybe String -> PoseidonIO [JannoRow] makeNewJannoRows sourceRows targetRows fields overwrite sKey tKey maybeStrip = forM targetRows $ \targetRow -> do posId <- getKeyFromJanno targetRow tKey sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows case sourceRowCandidates of [] -> return targetRow - [keyRow] -> mergeRow targetRow keyRow fields overwrite tKey + [keyRow] -> mergeRow targetRow keyRow fields overwrite sKey tKey _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId getKeyFromJanno :: (MonadThrow m) => JannoRow -> String -> m String @@ -89,19 +89,30 @@ matchWithOptionalStrip maybeRegex id1 id2 = let match = s =~ r in if null match then s else unpack $ replace (pack match) "" (pack s) --- note that we never overwrite values in the key-colum, that's why this function takes it as an argument to check. -mergeRow :: (MonadThrow m) => JannoRow -> JannoRow -> [String] -> Bool -> String -> m JannoRow -mergeRow targetRow sourceRow fields overwrite keyCol = do +mergeRow :: JannoRow -> JannoRow -> [String] -> Bool -> String -> String -> PoseidonIO JannoRow +mergeRow targetRow sourceRow fields overwrite sKey tKey = do let targetRowRecord = Csv.toNamedRecord targetRow sourceRowRecord = Csv.toNamedRecord sourceRow - parseResult = Csv.runParser . Csv.parseNamedRecord $ HM.unionWithKey mergeIfMissing targetRowRecord sourceRowRecord + newRowRecord = HM.unionWithKey mergeIfMissing targetRowRecord sourceRowRecord + parseResult = Csv.runParser . Csv.parseNamedRecord $ newRowRecord + logInfo $ "matched target individual with ID " ++ BSC.unpack (targetRowRecord HM.! (BSC.pack tKey)) ++ + " with source individual with ID " ++ BSC.unpack (sourceRowRecord HM.! (BSC.pack sKey)) case parseResult of - Left err -> throwM $ PoseidonGenericException $ "Janno row-merge error: " ++ err - Right r -> return r + Left err -> throwM . PoseidonGenericException $ "Janno row-merge error: " ++ err + Right r -> do + let newFields = HM.differenceWith (\v1 v2 -> if v1 == v2 then Nothing else Just v1) newRowRecord targetRowRecord + if HM.null newFields then do + logInfo "-- no changes" + return r + else do + forM (HM.toList newFields) $ \(key, val) -> + logInfo $ "-- copied \"" ++ BSC.unpack val ++ "\" from column " ++ BSC.unpack key + return r + return r where mergeIfMissing :: BSC.ByteString -> BSC.ByteString -> BSC.ByteString -> BSC.ByteString mergeIfMissing key targetVal sourceVal = - if key /= BSC.pack keyCol && (null fields || (BSC.unpack key `elem` fields)) && (targetVal `elem` ["n/a", ""] || overwrite) then + if key /= BSC.pack tKey && (null fields || (BSC.unpack key `elem` fields)) && (targetVal `elem` ["n/a", ""] || overwrite) then sourceVal else targetVal diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index 1bdd9426..e59aaad9 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -789,7 +789,7 @@ parseJannocoalTargetFile = OP.strOption ( parseJannocoalOutSpec :: OP.Parser (Maybe FilePath) parseJannocoalOutSpec = OP.option (Just <$> OP.str) ( OP.long "outFile" <> - OP.short `-o` <> + OP.short 'o' <> OP.metavar "FILE" <> OP.value Nothing <> OP.showDefault <> From 4486b344bebc16eb991be4843fbe81289d308e8a Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Tue, 19 Dec 2023 22:40:19 +0100 Subject: [PATCH 14/34] pedantic fixes --- src/Poseidon/CLI/Jannocoalesce.hs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 7f214325..9d0afba9 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -11,7 +11,7 @@ import Poseidon.Package (PackageReadOptions (..), import Poseidon.Utils (PoseidonException (..), PoseidonIO, logInfo) -import Control.Monad (filterM, forM) +import Control.Monad (filterM, forM, forM_) import Control.Monad.Catch (MonadThrow, throwM) import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BSC @@ -103,11 +103,9 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do let newFields = HM.differenceWith (\v1 v2 -> if v1 == v2 then Nothing else Just v1) newRowRecord targetRowRecord if HM.null newFields then do logInfo "-- no changes" - return r else do - forM (HM.toList newFields) $ \(key, val) -> + forM_ (HM.toList newFields) $ \(key, val) -> logInfo $ "-- copied \"" ++ BSC.unpack val ++ "\" from column " ++ BSC.unpack key - return r return r where mergeIfMissing :: BSC.ByteString -> BSC.ByteString -> BSC.ByteString -> BSC.ByteString From c760e8364defd7c3339800da3f08c54db90016d0 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Wed, 20 Dec 2023 11:20:28 +0100 Subject: [PATCH 15/34] fixed tests --- test/Poseidon/JannocoalesceSpec.hs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index 44691220..3db04d91 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -6,6 +6,7 @@ import Poseidon.Janno (CsvNamedRecord (..), JannoList (..), JannoRow (..), createMinimalSample, makeLatitude, makeLongitude) +import Poseidon.Utils (testLog) import qualified Data.HashMap.Strict as HM import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), Sex (..)) @@ -56,25 +57,25 @@ testMergeSingleRow :: Spec testMergeSingleRow = describe "Poseidon.Jannocoalesce.mergeRow" $ do it "should correctly merge without fields and no override" $ do - merged <- mergeRow jannoTargetRow jannoSourceRow [] False "Poseidon_ID" + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow [] False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 it "should correctly merge without fields and override" $ do - merged <- mergeRow jannoTargetRow jannoSourceRow [] True "Poseidon_ID" + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow [] True "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Salzburg" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 it "should correctly merge with fields and no override" $ do - merged <- mergeRow jannoTargetRow jannoSourceRow ["Group_Name", "Latitude"] False "Poseidon_ID" + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow ["Group_Name", "Latitude"] False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` Nothing it "should correctly merge with fields and override" $ do - merged <- mergeRow jannoTargetRow jannoSourceRow ["Group_Name", "Latitude"] True "Poseidon_ID" + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow ["Group_Name", "Latitude"] True "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 @@ -83,22 +84,22 @@ testMergeSingleRow = testCoalesceMultipleRows :: Spec testCoalesceMultipleRows = describe "Poseidon.Jannocoalesce.makeNewJannoRows" $ do it "should correctly copy with simple matching" $ do - newJ <- makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" Nothing + newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" Nothing jCountry (newJ !! 0) `shouldBe` Just "Germany" jCountry (newJ !! 1) `shouldBe` Just "Austria" jCountry (newJ !! 2) `shouldBe` Nothing it "should correctly copy with simple matching and overwrite" $ do - newJ <- makeNewJannoRows jannoSourceRows jannoTargetRows [] True "Poseidon_ID" "Poseidon_ID" Nothing + newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows [] True "Poseidon_ID" "Poseidon_ID" Nothing jCountry (newJ !! 0) `shouldBe` Just "Austria" jCountry (newJ !! 1) `shouldBe` Just "Austria" it "should throw with duplicate source keys" $ do let s = jannoSourceRows ++ [jannoSourceRows !! 1] - makeNewJannoRows s jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" Nothing `shouldThrow` anyException + testLog (makeNewJannoRows s jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" Nothing) `shouldThrow` anyException it "should correctly copy with suffix strip" $ do - newJ <- makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" (Just "_AB") + newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" (Just "_AB") jCountry (newJ !! 0) `shouldBe` Just "Germany" jCountry (newJ !! 1) `shouldBe` Just "Austria" jCountry (newJ !! 2) `shouldBe` Just "Austria" it "should correctly copy with alternative ID column" $ do - newJ <- makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID_alt" "Poseidon_ID" Nothing + newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID_alt" "Poseidon_ID" Nothing jCountry (newJ !! 2) `shouldBe` Just "Austria" From d2dc568ba7e1b550df677700dc46260b56c94947 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Wed, 20 Dec 2023 11:22:38 +0100 Subject: [PATCH 16/34] stylish haskell --- test/Poseidon/JannocoalesceSpec.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index 3db04d91..f95417fd 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -6,7 +6,7 @@ import Poseidon.Janno (CsvNamedRecord (..), JannoList (..), JannoRow (..), createMinimalSample, makeLatitude, makeLongitude) -import Poseidon.Utils (testLog) +import Poseidon.Utils (testLog) import qualified Data.HashMap.Strict as HM import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), Sex (..)) From 1bb95a973403098ad9fcf08ada90640cfc107043 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Wed, 20 Dec 2023 13:01:56 +0100 Subject: [PATCH 17/34] various minor suggestions --- src/Poseidon/CLI/Jannocoalesce.hs | 18 +++++----- .../CLI/OptparseApplicativeParsers.hs | 35 ++++++++++++++----- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 9d0afba9..6d95ceef 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -9,7 +9,7 @@ import Poseidon.Package (PackageReadOptions (..), getJointJanno, readPoseidonPackageCollection) import Poseidon.Utils (PoseidonException (..), PoseidonIO, - logInfo) + logInfo, logDebug) import Control.Monad (filterM, forM, forM_) import Control.Monad.Catch (MonadThrow, throwM) @@ -64,7 +64,9 @@ makeNewJannoRows sourceRows targetRows fields overwrite sKey tKey maybeStrip = posId <- getKeyFromJanno targetRow tKey sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows case sourceRowCandidates of - [] -> return targetRow + [] -> do + logInfo $ "no match for target " ++ posId ++ " in source" + return targetRow [keyRow] -> mergeRow targetRow keyRow fields overwrite sKey tKey _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId @@ -72,7 +74,7 @@ getKeyFromJanno :: (MonadThrow m) => JannoRow -> String -> m String getKeyFromJanno jannoRow key = do let jannoRowDict = Csv.toNamedRecord jannoRow case jannoRowDict HM.!? (BSC.pack key) of - Nothing -> throwM $ PoseidonGenericException ("Key " ++ key ++ " not present in janno file") + Nothing -> throwM $ PoseidonGenericException ("Key " ++ key ++ " not present in .janno file") Just r -> return $ BSC.unpack r matchWithOptionalStrip :: (Maybe String) -> String -> String -> Bool @@ -95,17 +97,17 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do sourceRowRecord = Csv.toNamedRecord sourceRow newRowRecord = HM.unionWithKey mergeIfMissing targetRowRecord sourceRowRecord parseResult = Csv.runParser . Csv.parseNamedRecord $ newRowRecord - logInfo $ "matched target individual with ID " ++ BSC.unpack (targetRowRecord HM.! (BSC.pack tKey)) ++ - " with source individual with ID " ++ BSC.unpack (sourceRowRecord HM.! (BSC.pack sKey)) + logInfo $ "matched target " ++ BSC.unpack (targetRowRecord HM.! (BSC.pack tKey)) ++ + " with source " ++ BSC.unpack (sourceRowRecord HM.! (BSC.pack sKey)) case parseResult of - Left err -> throwM . PoseidonGenericException $ "Janno row-merge error: " ++ err + Left err -> throwM . PoseidonGenericException $ ".janno row-merge error: " ++ err Right r -> do let newFields = HM.differenceWith (\v1 v2 -> if v1 == v2 then Nothing else Just v1) newRowRecord targetRowRecord if HM.null newFields then do - logInfo "-- no changes" + logDebug "-- no changes" else do forM_ (HM.toList newFields) $ \(key, val) -> - logInfo $ "-- copied \"" ++ BSC.unpack val ++ "\" from column " ++ BSC.unpack key + logDebug $ "-- copied \"" ++ BSC.unpack val ++ "\" from column " ++ BSC.unpack key return r where mergeIfMissing :: BSC.ByteString -> BSC.ByteString -> BSC.ByteString -> BSC.ByteString diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index e59aaad9..1c52049e 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -775,7 +775,7 @@ parseJannocoalSourceSpec = parseJannocoalSingleSource <|> (JannoSourceBaseDirs < OP.long "sourceFile" <> OP.short 's' <> OP.metavar "FILE" <> - OP.help "The source Janno file" + OP.help "The source .janno file." ) parseJannocoalTargetFile :: OP.Parser FilePath @@ -783,7 +783,7 @@ parseJannocoalTargetFile = OP.strOption ( OP.long "targetFile" <> OP.short 't' <> OP.metavar "FILE" <> - OP.help "The target file to fill" + OP.help "The target .janno file to fill." ) parseJannocoalOutSpec :: OP.Parser (Maybe FilePath) @@ -793,30 +793,47 @@ parseJannocoalOutSpec = OP.option (Just <$> OP.str) ( OP.metavar "FILE" <> OP.value Nothing <> OP.showDefault <> - OP.help "An optional file to write the results to. If not specified, change the target file in place." + OP.help "An optional file to write the results to. \ + \If not specified, change the target file in place." ) parseJannocoalFillColumns :: OP.Parser [String] parseJannocoalFillColumns = OP.option (splitOn "," <$> OP.str) ( OP.long "fillColumns" <> OP.value [] <> - OP.help "A comma-separated list of Janno field names. If not specified, fill all columns that can be found in the source and target." + OP.help "A comma-separated list of .janno field names. \ + \If not specified, fill all columns that can be found in the source and target." ) parseJannocoalOverride :: OP.Parser Bool parseJannocoalOverride = OP.switch ( OP.long "force" <> OP.short 'f' <> - OP.help "With this option, potential non-missing content in target columns gets overridden with non-missing content in source columns. By default, only missing data gets filled-in." + OP.help "With this option, potential non-missing content in target columns gets overridden \ + \with non-missing content in source columns. By default, only missing data gets filled-in." ) parseJannocoalSourceKey :: OP.Parser String -parseJannocoalSourceKey = OP.strOption (OP.long "sourceKey" <> OP.help "the janno column to use as the source key" <> OP.value "Poseidon_ID" <> OP.showDefault) +parseJannocoalSourceKey = OP.strOption ( + OP.long "sourceKey" <> + OP.help "The .janno column to use as the source key." <> + OP.value "Poseidon_ID" <> + OP.showDefault + ) parseJannocoalTargetKey :: OP.Parser String -parseJannocoalTargetKey = OP.strOption (OP.long "targetKey" <> OP.help "the janno column to use as the target key" <> OP.value "Poseidon_ID" <> OP.showDefault) +parseJannocoalTargetKey = OP.strOption ( + OP.long "targetKey" <> + OP.help "The .janno column to use as the target key." <> + OP.value "Poseidon_ID" <> + OP.showDefault + ) parseJannocoalIdStripRegex :: OP.Parser (Maybe String) -parseJannocoalIdStripRegex = OP.option (Just <$> OP.str) (OP.long "stripIdSuffix" <> - OP.help "an optional regular expression to identify parts of the IDs to strip before matching between source and target" <> OP.value Nothing) +parseJannocoalIdStripRegex = OP.option (Just <$> OP.str) ( + OP.long "stripIdRegex" <> + OP.help "An optional regular expression to identify parts of the IDs to strip \ + \before matching between source and target. Uses POSIX Extended regular expressions." <> + OP.value Nothing + ) From d2d97cbcd87d3c3ef35717dfc20e8b257dabd6f2 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Wed, 20 Dec 2023 13:02:26 +0100 Subject: [PATCH 18/34] stylish haskell --- src/Poseidon/CLI/Jannocoalesce.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 6d95ceef..e6076d0a 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -9,7 +9,7 @@ import Poseidon.Package (PackageReadOptions (..), getJointJanno, readPoseidonPackageCollection) import Poseidon.Utils (PoseidonException (..), PoseidonIO, - logInfo, logDebug) + logDebug, logInfo) import Control.Monad (filterM, forM, forM_) import Control.Monad.Catch (MonadThrow, throwM) From 7453873b92355d1a7a92e561b928c7200156156d Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Wed, 20 Dec 2023 13:15:06 +0100 Subject: [PATCH 19/34] added message to indicate the start of the coalescing process --- src/Poseidon/CLI/Jannocoalesce.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index e6076d0a..8c16f233 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -59,7 +59,8 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit writeJannoFile outPath (JannoRows newJanno) makeNewJannoRows :: [JannoRow] -> [JannoRow] -> [String] -> Bool -> String -> String -> Maybe String -> PoseidonIO [JannoRow] -makeNewJannoRows sourceRows targetRows fields overwrite sKey tKey maybeStrip = +makeNewJannoRows sourceRows targetRows fields overwrite sKey tKey maybeStrip = do + logInfo "Starting to coalesce..." forM targetRows $ \targetRow -> do posId <- getKeyFromJanno targetRow tKey sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows From 899433ad52a9f7ec54839af8774e0ff473ed73c1 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Wed, 21 Feb 2024 15:39:04 +0100 Subject: [PATCH 20/34] changed the interface of jannocoalesce to allow excluding columns --- CHANGELOG.md | 2 +- src-executables/Main-trident.hs | 2 +- src/Poseidon/CLI/Jannocoalesce.hs | 27 ++++++++++++++----- .../CLI/OptparseApplicativeParsers.hs | 24 +++++++++++------ .../GoldenTestsRunCommands.hs | 5 ++-- 5 files changed, 41 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bfb7ea7d..92e5f3f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ - V 1.4.1.0: - - Added new tool `trident jannocoalesce`, which merges information from a source janno file to a target janno file. + - Added new tool `trident jannocoalesce`, which merges information from a source .janno file to a target .janno file. - V 1.4.0.4: - Added better error messages for generic cassava parsing (e.g. for broken Int and Double fields) in .janno files. - Added better error handling and messages for inconsistent `Date_*`, `Contamination_*` and `Relation_*` columns in .janno files using an `Except` & `Writer` monad stack. diff --git a/src-executables/Main-trident.hs b/src-executables/Main-trident.hs index 81c33f47..b04b7a06 100644 --- a/src-executables/Main-trident.hs +++ b/src-executables/Main-trident.hs @@ -273,7 +273,7 @@ jannocoalesceOptParser :: OP.Parser JannoCoalesceOptions jannocoalesceOptParser = JannoCoalesceOptions <$> parseJannocoalSourceSpec <*> parseJannocoalTargetFile <*> parseJannocoalOutSpec - <*> parseJannocoalFillColumns + <*> parseJannocoalJannoColumns <*> parseJannocoalOverride <*> parseJannocoalSourceKey <*> parseJannocoalTargetKey diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 8c16f233..0a3970f3 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -25,11 +25,16 @@ import Text.Regex.TDFA ((=~)) -- the source can be a single janno file, or a set of base directories as usual. data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] +data CoalesceJannoColumnSpec = + AllJannoColumns + | IncludeJannoColumns [String] + | ExcludeJannoColumns [String] + data JannoCoalesceOptions = JannoCoalesceOptions { _jannocoalesceSource :: JannoSourceSpec , _jannocoalesceTarget :: FilePath , _jannocoalesceOutSpec :: Maybe FilePath -- Nothing means "in place" - , _jannocoalesceFillColumns :: [String] -- empty list means All + , _jannocoalesceJannoColumns :: CoalesceJannoColumnSpec , _jannocoalesceOverwriteColumns :: Bool , _jannocoalesceSourceKey :: String -- by default set to "Poseidon_ID" , _jannocoalesceTargetKey :: String -- by default set to "Poseidon_ID" @@ -58,7 +63,7 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit createDirectoryIfMissing True (takeDirectory outPath) writeJannoFile outPath (JannoRows newJanno) -makeNewJannoRows :: [JannoRow] -> [JannoRow] -> [String] -> Bool -> String -> String -> Maybe String -> PoseidonIO [JannoRow] +makeNewJannoRows :: [JannoRow] -> [JannoRow] -> CoalesceJannoColumnSpec -> Bool -> String -> String -> Maybe String -> PoseidonIO [JannoRow] makeNewJannoRows sourceRows targetRows fields overwrite sKey tKey maybeStrip = do logInfo "Starting to coalesce..." forM targetRows $ \targetRow -> do @@ -92,7 +97,7 @@ matchWithOptionalStrip maybeRegex id1 id2 = let match = s =~ r in if null match then s else unpack $ replace (pack match) "" (pack s) -mergeRow :: JannoRow -> JannoRow -> [String] -> Bool -> String -> String -> PoseidonIO JannoRow +mergeRow :: JannoRow -> JannoRow -> CoalesceJannoColumnSpec -> Bool -> String -> String -> PoseidonIO JannoRow mergeRow targetRow sourceRow fields overwrite sKey tKey = do let targetRowRecord = Csv.toNamedRecord targetRow sourceRowRecord = Csv.toNamedRecord sourceRow @@ -113,7 +118,15 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do where mergeIfMissing :: BSC.ByteString -> BSC.ByteString -> BSC.ByteString -> BSC.ByteString mergeIfMissing key targetVal sourceVal = - if key /= BSC.pack tKey && (null fields || (BSC.unpack key `elem` fields)) && (targetVal `elem` ["n/a", ""] || overwrite) then - sourceVal - else - targetVal + -- don't overwrite key + if key /= BSC.pack tKey + -- overwrite field only if it's requested + && includeField (BSC.unpack key) fields + -- overwrite only empty fields, except overwrite is set + && (targetVal `elem` ["n/a", ""] || overwrite) + then sourceVal + else targetVal + includeField :: String -> CoalesceJannoColumnSpec -> Bool + includeField _ AllJannoColumns = True + includeField key (IncludeJannoColumns xs) = key `elem` xs + includeField key (ExcludeJannoColumns xs) = key `notElem` xs diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index 1c52049e..ec038457 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -3,7 +3,7 @@ module Poseidon.CLI.OptparseApplicativeParsers where import Poseidon.CLI.Chronicle (ChronOperation (..)) -import Poseidon.CLI.Jannocoalesce (JannoSourceSpec (..)) +import Poseidon.CLI.Jannocoalesce (JannoSourceSpec (..), CoalesceJannoColumnSpec (..)) import Poseidon.CLI.List (ListEntity (..), RepoLocationSpec (..)) import Poseidon.CLI.Rectify (ChecksumsToRectify (..), @@ -797,13 +797,21 @@ parseJannocoalOutSpec = OP.option (Just <$> OP.str) ( \If not specified, change the target file in place." ) -parseJannocoalFillColumns :: OP.Parser [String] -parseJannocoalFillColumns = OP.option (splitOn "," <$> OP.str) ( - OP.long "fillColumns" <> - OP.value [] <> - OP.help "A comma-separated list of .janno field names. \ - \If not specified, fill all columns that can be found in the source and target." - ) +parseJannocoalJannoColumns :: OP.Parser CoalesceJannoColumnSpec +parseJannocoalJannoColumns = includeJannoColumns OP.<|> excludeJannoColumns OP.<|> pure AllJannoColumns + where + includeJannoColumns = OP.option (IncludeJannoColumns . splitOn "," <$> OP.str) ( + OP.long "includeColumns" <> + OP.help "A comma-separated list of .janno column names to coalesce. \ + \If not specified, all columns that can be found in the source \ + \and target will get filled." + ) + excludeJannoColumns = OP.option (ExcludeJannoColumns . splitOn "," <$> OP.str) ( + OP.long "excludeColumns" <> + OP.help "A comma-separated list of .janno column names NOT to coalesce. \ + \All columns that can be found in the source and target will get filled, \ + \except the ones listed here." + ) parseJannocoalOverride :: OP.Parser Bool parseJannocoalOverride = OP.switch ( diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index 3151571d..161ebd2f 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -11,7 +11,8 @@ import Poseidon.CLI.Genoconvert (GenoconvertOptions (..), import Poseidon.CLI.Init (InitOptions (..), runInit) import Poseidon.CLI.Jannocoalesce (JannoCoalesceOptions (..), JannoSourceSpec (..), - runJannocoalesce) + runJannocoalesce, + CoalesceJannoColumnSpec (..)) import Poseidon.CLI.List (ListEntity (..), ListOptions (..), RepoLocationSpec (..), runList) import Poseidon.CLI.Rectify (ChecksumsToRectify (..), @@ -1090,7 +1091,7 @@ testPipelineJannocoalesce testDir checkFilePath = do _jannocoalesceSource = JannoSourceSingle "test/testDat/testJannoFiles/normal_full.janno", _jannocoalesceTarget = "test/testDat/testJannoFiles/normal_subset.janno", _jannocoalesceOutSpec = Just (testDir "jannocoalesce" "targetNoFieldsNoOverride.janno"), - _jannocoalesceFillColumns = [], + _jannocoalesceJannoColumns = AllJannoColumns, _jannocoalesceOverwriteColumns = False, _jannocoalesceSourceKey = "Poseidon_ID", _jannocoalesceTargetKey = "Poseidon_ID", From 07fe33a92f1e2bd77bfab8b3ebce84f2063464de Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Wed, 21 Feb 2024 16:27:44 +0100 Subject: [PATCH 21/34] added some more tests for the new column selection functionality in jannocoalesce --- test/Poseidon/JannocoalesceSpec.hs | 28 +++++++++------ .../GoldenTestCheckSumFile.txt | 4 ++- .../GoldenTestData/chronicle/chronicle2.yml | 14 ++++---- ...NoFieldsNoOverride.janno => target1.janno} | 4 +-- .../jannocoalesce/target2.janno | 4 +++ .../jannocoalesce/target3.janno | 4 +++ .../GoldenTestsRunCommands.hs | 35 +++++++++++++++++-- 7 files changed, 69 insertions(+), 24 deletions(-) rename test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/{targetNoFieldsNoOverride.janno => target1.janno} (59%) create mode 100644 test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target2.janno create mode 100644 test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target3.janno diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index f95417fd..da78d799 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} module Poseidon.JannocoalesceSpec (spec) where -import Poseidon.CLI.Jannocoalesce (makeNewJannoRows, mergeRow) +import Poseidon.CLI.Jannocoalesce (makeNewJannoRows, mergeRow, CoalesceJannoColumnSpec (..)) import Poseidon.Janno (CsvNamedRecord (..), JannoList (..), JannoRow (..), createMinimalSample, makeLatitude, @@ -57,25 +57,31 @@ testMergeSingleRow :: Spec testMergeSingleRow = describe "Poseidon.Jannocoalesce.mergeRow" $ do it "should correctly merge without fields and no override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow [] False "Poseidon_ID" "Poseidon_ID" + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow AllJannoColumns False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 it "should correctly merge without fields and override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow [] True "Poseidon_ID" "Poseidon_ID" + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow AllJannoColumns True "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Salzburg" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 - it "should correctly merge with fields and no override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow ["Group_Name", "Latitude"] False "Poseidon_ID" "Poseidon_ID" + it "should correctly merge with fields selection and no override" $ do + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (IncludeJannoColumns ["Group_Name", "Latitude"]) False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` Nothing + it "should correctly merge with negative field selection and no override" $ do + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (ExcludeJannoColumns ["Latitude"]) False "Poseidon_ID" "Poseidon_ID" + jSite merged `shouldBe` Just "Vienna" + jGroupName merged `shouldBe` JannoList ["SamplePop"] + jLatitude merged `shouldBe` Nothing + jLongitude merged `shouldBe` makeLongitude 30.0 it "should correctly merge with fields and override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow ["Group_Name", "Latitude"] True "Poseidon_ID" "Poseidon_ID" + merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (IncludeJannoColumns ["Group_Name", "Latitude"]) True "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 @@ -84,22 +90,22 @@ testMergeSingleRow = testCoalesceMultipleRows :: Spec testCoalesceMultipleRows = describe "Poseidon.Jannocoalesce.makeNewJannoRows" $ do it "should correctly copy with simple matching" $ do - newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" Nothing + newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows AllJannoColumns False "Poseidon_ID" "Poseidon_ID" Nothing jCountry (newJ !! 0) `shouldBe` Just "Germany" jCountry (newJ !! 1) `shouldBe` Just "Austria" jCountry (newJ !! 2) `shouldBe` Nothing it "should correctly copy with simple matching and overwrite" $ do - newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows [] True "Poseidon_ID" "Poseidon_ID" Nothing + newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows AllJannoColumns True "Poseidon_ID" "Poseidon_ID" Nothing jCountry (newJ !! 0) `shouldBe` Just "Austria" jCountry (newJ !! 1) `shouldBe` Just "Austria" it "should throw with duplicate source keys" $ do let s = jannoSourceRows ++ [jannoSourceRows !! 1] - testLog (makeNewJannoRows s jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" Nothing) `shouldThrow` anyException + testLog (makeNewJannoRows s jannoTargetRows AllJannoColumns False "Poseidon_ID" "Poseidon_ID" Nothing) `shouldThrow` anyException it "should correctly copy with suffix strip" $ do - newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID" "Poseidon_ID" (Just "_AB") + newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows AllJannoColumns False "Poseidon_ID" "Poseidon_ID" (Just "_AB") jCountry (newJ !! 0) `shouldBe` Just "Germany" jCountry (newJ !! 1) `shouldBe` Just "Austria" jCountry (newJ !! 2) `shouldBe` Just "Austria" it "should correctly copy with alternative ID column" $ do - newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows [] False "Poseidon_ID_alt" "Poseidon_ID" Nothing + newJ <- testLog $ makeNewJannoRows jannoSourceRows jannoTargetRows AllJannoColumns False "Poseidon_ID_alt" "Poseidon_ID" Nothing jCountry (newJ !! 2) `shouldBe` Just "Austria" diff --git a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt index 761322d1..38598237 100644 --- a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt +++ b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt @@ -113,4 +113,6 @@ b43da4d5734371c0648553120f812466 fetch fetch/multi_packages_2/Lamnidis_2018-1.0. 0ddad9ea097bca0253e0c3c6157efa68 listRemote listRemote/listRemote2 b2286cf9af7c6c8757b8109a1f58e2d9 listRemote listRemote/listRemote3 0433b2a80ee5a2eb5bf8c6404130e562 listRemote listRemote/listRemote4 -4361cf41737cb13cc50cd21936c64030 jannocoalesce jannocoalesce/targetNoFieldsNoOverride.janno \ No newline at end of file +282cedf121f37e81c1e45ec0dfb97560 jannocoalesce jannocoalesce/target1.janno +bca03ca94c0f4d872aafd03d649a4f6c jannocoalesce jannocoalesce/target2.janno +a202f0c1636d55258454ad0a0dfea977 jannocoalesce jannocoalesce/target3.janno \ No newline at end of file diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml index f9a6e3de..f446f239 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml +++ b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml @@ -1,29 +1,29 @@ title: Chronicle title description: Chronicle description chronicleVersion: 0.2.0 -lastModified: 2023-12-05 +lastModified: 2024-02-21 packages: - title: Lamnidis_2018 version: 1.0.0 - commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e + commit: 671da4388a118b5c2616cea7b935f2c35fc84060 path: Lamnidis_2018 - title: Lamnidis_2018 version: 1.0.1 - commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e + commit: 671da4388a118b5c2616cea7b935f2c35fc84060 path: Lamnidis_2018_newVersion - title: Schiffels version: 1.1.1 - commit: a573f677fea11b12a1e9d00256c208a18dd09d51 + commit: 9122486b44a36185bea5033d41220e474bb0ada9 path: Schiffels - title: Schiffels_2016 version: 1.0.1 - commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e + commit: 671da4388a118b5c2616cea7b935f2c35fc84060 path: Schiffels_2016 - title: Schmid_2028 version: 1.0.0 - commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e + commit: 671da4388a118b5c2616cea7b935f2c35fc84060 path: Schmid_2028 - title: Wang_2020 version: 0.1.0 - commit: fee8b1e56bbcf70632bcdc55da017e888734ad8e + commit: 671da4388a118b5c2616cea7b935f2c35fc84060 path: Wang_2020 diff --git a/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/targetNoFieldsNoOverride.janno b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target1.janno similarity index 59% rename from test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/targetNoFieldsNoOverride.janno rename to test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target1.janno index 5fb72c09..8760de51 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/targetNoFieldsNoOverride.janno +++ b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target1.janno @@ -1,4 +1,4 @@ Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AdditionalColumn1 AdditionalColumn2 -XXX011 M POP1;POP3 Paul;Peter XXX012;I1234 first;second father_of;grandfather_of yyy n/a xxx DE Aachen xxx 0.0 0.0 C14 A-1;A-2;A-3 3000;3100;2900 30;40;20 -1200 -1000 -800 x x x A C xxx;yyy 2 Lib1;Lib2 Shotgun;1240K minus ds diploid ftp://test.test 0.0 0 0.0 0.0 10 1 ANGSD v.123 n/a Ich;mag;Kekse Ich unpublished This is a fine sample Hutschnur test1 test2 -XXX012 F POP2 n/a XXX011 first daughter_of n/a n/a xxx FR xxx Cologne -90.0 -180.0 contextual n/a n/a n/a -5500 -5000 -4500 yyy B B xxx 0 Lib3 1240K half ss haploid https://www.google.de 0.0 0 0.0 100.0 20;50;70 2;5;7.4 Schmutzi v145;Zwiebel;other xxx aus Du PaulNature2026 Cheesecake n/a test3 test4 +XXX011 M POP1 Paul;Peter XXX012;I1234 first;second father_of;grandfather_of yyy n/a xxx DE xxx xxx 0.0 0.0 C14 A-1;A-2;A-3 3000;3100;2900 30;40;20 -1200 -1000 -800 x x x A C xxx;yyy 2 Lib1;Lib2 Shotgun;1240K minus ds diploid ftp://test.test 0.0 0 0.0 0.0 10 1 ANGSD v.123 n/a Ich;mag;Kekse Ich unpublished This is a fine sample Hutschnur test1 test2 +XXX012 F POP2 n/a XXX011 first daughter_of n/a n/a xxx FR xxx xxx -90.0 -180.0 contextual n/a n/a n/a -5500 -5000 -4500 yyy B B xxx 0 Lib3 1240K half ss haploid https://www.google.de 0.0 0 0.0 100.0 20;50;70 2;5;7.4 Schmutzi v145;Zwiebel;other xxx aus Du PaulNature2026 Cheesecake n/a test3 test4 XXX013 M POP1 Skeleton Joe XXX011 sixthToTenth n/a xxx n/a xxx EG xxx xxx 90.0 180.0 modern n/a n/a n/a 2000 2000 2000 n/a C A xxx 0 n/a ReferenceGenome plus mixed diploid http://huhu.org/23&test 0.0 0 0.0 50.0 n/a n/a n/a n/a der Dose Müllers Kuh BovineCell1618 n/a A;B;C test5 test6 diff --git a/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target2.janno b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target2.janno new file mode 100644 index 00000000..cec0295f --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target2.janno @@ -0,0 +1,4 @@ +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AdditionalColumn1 AdditionalColumn2 +XXX011 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 0.0 0.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a test1 test2 +XXX012 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a -90.0 -180.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a test3 test4 +XXX013 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 90.0 180.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a test5 test6 diff --git a/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target3.janno b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target3.janno new file mode 100644 index 00000000..866bce4a --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target3.janno @@ -0,0 +1,4 @@ +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AdditionalColumn1 AdditionalColumn2 +XXX011 M POP1 Paul;Peter XXX012;I1234 first;second father_of;grandfather_of yyy n/a xxx DE xxx xxx n/a n/a C14 A-1;A-2;A-3 3000;3100;2900 30;40;20 -1200 -1000 -800 x x x A C xxx;yyy 2 Lib1;Lib2 Shotgun;1240K minus ds diploid ftp://test.test 0.0 0 0.0 0.0 10 1 ANGSD v.123 n/a Ich;mag;Kekse Ich unpublished This is a fine sample Hutschnur test1 test2 +XXX012 F POP2 n/a XXX011 first daughter_of n/a n/a xxx FR xxx xxx n/a n/a contextual n/a n/a n/a -5500 -5000 -4500 yyy B B xxx 0 Lib3 1240K half ss haploid https://www.google.de 0.0 0 0.0 100.0 20;50;70 2;5;7.4 Schmutzi v145;Zwiebel;other xxx aus Du PaulNature2026 Cheesecake n/a test3 test4 +XXX013 M POP1 Skeleton Joe XXX011 sixthToTenth n/a xxx n/a xxx EG xxx xxx n/a n/a modern n/a n/a n/a 2000 2000 2000 n/a C A xxx 0 n/a ReferenceGenome plus mixed diploid http://huhu.org/23&test 0.0 0 0.0 50.0 n/a n/a n/a n/a der Dose Müllers Kuh BovineCell1618 n/a A;B;C test5 test6 diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index 161ebd2f..8c3e1d29 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -1087,10 +1087,11 @@ testPipelineListRemote testDir checkFilePath = do testPipelineJannocoalesce :: FilePath -> FilePath -> IO () testPipelineJannocoalesce testDir checkFilePath = do + -- simple coalesce let jannocoalesceOpts1 = JannoCoalesceOptions { _jannocoalesceSource = JannoSourceSingle "test/testDat/testJannoFiles/normal_full.janno", - _jannocoalesceTarget = "test/testDat/testJannoFiles/normal_subset.janno", - _jannocoalesceOutSpec = Just (testDir "jannocoalesce" "targetNoFieldsNoOverride.janno"), + _jannocoalesceTarget = "test/testDat/testJannoFiles/minimal_full.janno", + _jannocoalesceOutSpec = Just (testDir "jannocoalesce" "target1.janno"), _jannocoalesceJannoColumns = AllJannoColumns, _jannocoalesceOverwriteColumns = False, _jannocoalesceSourceKey = "Poseidon_ID", @@ -1098,5 +1099,33 @@ testPipelineJannocoalesce testDir checkFilePath = do _jannocoalesceIdStrip = Nothing } runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts1) "jannocoalesce" [ - "jannocoalesce" "targetNoFieldsNoOverride.janno" + "jannocoalesce" "target1.janno" + ] + -- only coalesce certain columns (--includeColumns) + let jannocoalesceOpts2 = JannoCoalesceOptions { + _jannocoalesceSource = JannoSourceSingle "test/testDat/testJannoFiles/normal_full.janno", + _jannocoalesceTarget = "test/testDat/testJannoFiles/minimal_full.janno", + _jannocoalesceOutSpec = Just (testDir "jannocoalesce" "target2.janno"), + _jannocoalesceJannoColumns = IncludeJannoColumns ["Latitude", "Longitude"], + _jannocoalesceOverwriteColumns = False, + _jannocoalesceSourceKey = "Poseidon_ID", + _jannocoalesceTargetKey = "Poseidon_ID", + _jannocoalesceIdStrip = Nothing + } + runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts2) "jannocoalesce" [ + "jannocoalesce" "target2.janno" + ] + -- do not coalesce certain columns (--excludeColumns) + let jannocoalesceOpts3 = JannoCoalesceOptions { + _jannocoalesceSource = JannoSourceSingle "test/testDat/testJannoFiles/normal_full.janno", + _jannocoalesceTarget = "test/testDat/testJannoFiles/minimal_full.janno", + _jannocoalesceOutSpec = Just (testDir "jannocoalesce" "target3.janno"), + _jannocoalesceJannoColumns = ExcludeJannoColumns ["Latitude", "Longitude"], + _jannocoalesceOverwriteColumns = False, + _jannocoalesceSourceKey = "Poseidon_ID", + _jannocoalesceTargetKey = "Poseidon_ID", + _jannocoalesceIdStrip = Nothing + } + runAndChecksumFiles checkFilePath testDir (testLog $ runJannocoalesce jannocoalesceOpts3) "jannocoalesce" [ + "jannocoalesce" "target3.janno" ] From 3769a4cda0531d62edb92ceb7043ba65d920aa8c Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Wed, 21 Feb 2024 17:20:08 +0100 Subject: [PATCH 22/34] added test cases for additional columns in jannocoalesce - currently failing --- test/Poseidon/JannocoalesceSpec.hs | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index da78d799..75e0fd22 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -23,7 +23,10 @@ jannoTargetRow = in row { jCountry = Just "Austria", jSite = Just "Vienna", - jDateNote = Just "dating didn't work" + jDateNote = Just "dating didn't work", + jAdditionalColumns = CsvNamedRecord $ HM.fromList [ + ("AdditionalColumn2", "C") + ] } jannoSourceRow :: JannoRow @@ -33,7 +36,11 @@ jannoSourceRow = jCountry = Just "Austria", jSite = Just "Salzburg", jLatitude = makeLatitude 30.0, - jLongitude = makeLongitude 30.0 + jLongitude = makeLongitude 30.0, + jAdditionalColumns = CsvNamedRecord $ HM.fromList [ + ("AdditionalColumn1", "A"), + ("AdditionalColumn2", "B") + ] } jannoTargetRows :: [JannoRow] @@ -62,30 +69,44 @@ testMergeSingleRow = jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 + jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList [ + ("AdditionalColumn1", "A"), + ("AdditionalColumn2", "C") + ]) it "should correctly merge without fields and override" $ do merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow AllJannoColumns True "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Salzburg" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 + jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList [ + ("AdditionalColumn1", "A"), + ("AdditionalColumn2", "B") + ]) it "should correctly merge with fields selection and no override" $ do merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (IncludeJannoColumns ["Group_Name", "Latitude"]) False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` Nothing + jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList []) it "should correctly merge with negative field selection and no override" $ do merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (ExcludeJannoColumns ["Latitude"]) False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` Nothing jLongitude merged `shouldBe` makeLongitude 30.0 + jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList [ + ("AdditionalColumn1", "A"), + ("AdditionalColumn2", "C") + ]) it "should correctly merge with fields and override" $ do merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (IncludeJannoColumns ["Group_Name", "Latitude"]) True "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` Nothing + jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList []) testCoalesceMultipleRows :: Spec testCoalesceMultipleRows = describe "Poseidon.Jannocoalesce.makeNewJannoRows" $ do From 8254653fbf736a9340312d636860d8d1da32393a Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 22 Feb 2024 10:28:44 +0100 Subject: [PATCH 23/34] work towards fixing the additional column issue --- src/Poseidon/CLI/Jannocoalesce.hs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 0a3970f3..3f957c3b 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -1,4 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TupleSections #-} module Poseidon.CLI.Jannocoalesce where @@ -101,14 +102,16 @@ mergeRow :: JannoRow -> JannoRow -> CoalesceJannoColumnSpec -> Bool -> String -> mergeRow targetRow sourceRow fields overwrite sKey tKey = do let targetRowRecord = Csv.toNamedRecord targetRow sourceRowRecord = Csv.toNamedRecord sourceRow - newRowRecord = HM.unionWithKey mergeIfMissing targetRowRecord sourceRowRecord + sourceKeys = HM.keys sourceRowRecord + targetComplete = HM.union targetRowRecord (HM.fromList $ map (,BSC.pack "") sourceKeys) + newRowRecord = HM.unionWithKey mergeIfMissing targetComplete sourceRowRecord parseResult = Csv.runParser . Csv.parseNamedRecord $ newRowRecord - logInfo $ "matched target " ++ BSC.unpack (targetRowRecord HM.! (BSC.pack tKey)) ++ + logInfo $ "matched target " ++ BSC.unpack (targetComplete HM.! (BSC.pack tKey)) ++ " with source " ++ BSC.unpack (sourceRowRecord HM.! (BSC.pack sKey)) case parseResult of Left err -> throwM . PoseidonGenericException $ ".janno row-merge error: " ++ err Right r -> do - let newFields = HM.differenceWith (\v1 v2 -> if v1 == v2 then Nothing else Just v1) newRowRecord targetRowRecord + let newFields = HM.differenceWith (\v1 v2 -> if v1 == v2 then Nothing else Just v1) newRowRecord targetComplete if HM.null newFields then do logDebug "-- no changes" else do From 7e5e213af28ddd758605316f54873df0122eeee5 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 22 Feb 2024 10:33:42 +0100 Subject: [PATCH 24/34] fixed the expected test results --- test/Poseidon/JannocoalesceSpec.hs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index 75e0fd22..5ea8ebd1 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -69,7 +69,7 @@ testMergeSingleRow = jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 - jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList [ + jAdditionalColumns merged `shouldBe` CsvNamedRecord (HM.fromList [ ("AdditionalColumn1", "A"), ("AdditionalColumn2", "C") ]) @@ -79,7 +79,7 @@ testMergeSingleRow = jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` makeLongitude 30.0 - jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList [ + jAdditionalColumns merged `shouldBe` CsvNamedRecord (HM.fromList [ ("AdditionalColumn1", "A"), ("AdditionalColumn2", "B") ]) @@ -89,14 +89,16 @@ testMergeSingleRow = jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` Nothing - jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList []) + jAdditionalColumns merged `shouldBe` CsvNamedRecord (HM.fromList [ + ("AdditionalColumn2", "C") + ]) it "should correctly merge with negative field selection and no override" $ do merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (ExcludeJannoColumns ["Latitude"]) False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` Nothing jLongitude merged `shouldBe` makeLongitude 30.0 - jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList [ + jAdditionalColumns merged `shouldBe` CsvNamedRecord (HM.fromList [ ("AdditionalColumn1", "A"), ("AdditionalColumn2", "C") ]) @@ -106,7 +108,9 @@ testMergeSingleRow = jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 jLongitude merged `shouldBe` Nothing - jAdditionalColumns merged `shouldBe` (CsvNamedRecord $ HM.fromList []) + jAdditionalColumns merged `shouldBe` CsvNamedRecord (HM.fromList [ + ("AdditionalColumn2", "C") + ]) testCoalesceMultipleRows :: Spec testCoalesceMultipleRows = describe "Poseidon.Jannocoalesce.makeNewJannoRows" $ do From 35c3df1e956dd72fda798b5dad62f93811b9816f Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 22 Feb 2024 16:52:35 +0100 Subject: [PATCH 25/34] an idea for an alternative implementation of jannocoalesce - seems to work, but I'm not sure if it's fast enough --- src/Poseidon/CLI/Jannocoalesce.hs | 26 ++++++++++++------- .../CLI/OptparseApplicativeParsers.hs | 6 ++--- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 3f957c3b..c7909671 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -22,14 +22,15 @@ import Data.Text (pack, replace, unpack) import System.Directory (createDirectoryIfMissing) import System.FilePath (takeDirectory) import Text.Regex.TDFA ((=~)) +import Data.List ((\\)) -- the source can be a single janno file, or a set of base directories as usual. data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] data CoalesceJannoColumnSpec = AllJannoColumns - | IncludeJannoColumns [String] - | ExcludeJannoColumns [String] + | IncludeJannoColumns [BSC.ByteString] + | ExcludeJannoColumns [BSC.ByteString] data JannoCoalesceOptions = JannoCoalesceOptions { _jannocoalesceSource :: JannoSourceSpec @@ -103,8 +104,11 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do let targetRowRecord = Csv.toNamedRecord targetRow sourceRowRecord = Csv.toNamedRecord sourceRow sourceKeys = HM.keys sourceRowRecord - targetComplete = HM.union targetRowRecord (HM.fromList $ map (,BSC.pack "") sourceKeys) - newRowRecord = HM.unionWithKey mergeIfMissing targetComplete sourceRowRecord + sourceKeysWanted = determineDesiredSourceKeys sourceKeys fields + targetComplete = HM.union targetRowRecord (HM.fromList $ map (, "" :: BSC.ByteString) sourceKeysWanted) + newRowRecord = HM.mapWithKey fillFromSource targetComplete + --targetComplete = HM.union targetRowRecord (HM.fromList $ map (,BSC.pack "") sourceKeys) + --newRowRecord = HM.unionWithKey mergeIfMissing targetComplete sourceRowRecord parseResult = Csv.runParser . Csv.parseNamedRecord $ newRowRecord logInfo $ "matched target " ++ BSC.unpack (targetComplete HM.! (BSC.pack tKey)) ++ " with source " ++ BSC.unpack (sourceRowRecord HM.! (BSC.pack sKey)) @@ -119,17 +123,21 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do logDebug $ "-- copied \"" ++ BSC.unpack val ++ "\" from column " ++ BSC.unpack key return r where - mergeIfMissing :: BSC.ByteString -> BSC.ByteString -> BSC.ByteString -> BSC.ByteString - mergeIfMissing key targetVal sourceVal = + determineDesiredSourceKeys :: [BSC.ByteString] -> CoalesceJannoColumnSpec -> [BSC.ByteString] + determineDesiredSourceKeys keys AllJannoColumns = keys + determineDesiredSourceKeys _ (IncludeJannoColumns included) = included + determineDesiredSourceKeys keys (ExcludeJannoColumns excluded) = keys \\ excluded + fillFromSource :: BSC.ByteString -> BSC.ByteString -> BSC.ByteString + fillFromSource key targetVal = -- don't overwrite key if key /= BSC.pack tKey -- overwrite field only if it's requested - && includeField (BSC.unpack key) fields + && includeField key fields -- overwrite only empty fields, except overwrite is set && (targetVal `elem` ["n/a", ""] || overwrite) - then sourceVal + then HM.findWithDefault "" key (Csv.toNamedRecord sourceRow) else targetVal - includeField :: String -> CoalesceJannoColumnSpec -> Bool + includeField :: BSC.ByteString -> CoalesceJannoColumnSpec -> Bool includeField _ AllJannoColumns = True includeField key (IncludeJannoColumns xs) = key `elem` xs includeField key (ExcludeJannoColumns xs) = key `notElem` xs diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index ec038457..214a3f37 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -33,7 +33,7 @@ import System.FilePath (dropExtension, takeExtension, (<.>)) import qualified Text.Parsec as P import Text.Read (readMaybe) - +import qualified Data.ByteString.Char8 as BSC parseChronOperation :: OP.Parser ChronOperation parseChronOperation = (CreateChron <$> parseChronOutPath) <|> (UpdateChron <$> parseChronUpdatePath) @@ -800,13 +800,13 @@ parseJannocoalOutSpec = OP.option (Just <$> OP.str) ( parseJannocoalJannoColumns :: OP.Parser CoalesceJannoColumnSpec parseJannocoalJannoColumns = includeJannoColumns OP.<|> excludeJannoColumns OP.<|> pure AllJannoColumns where - includeJannoColumns = OP.option (IncludeJannoColumns . splitOn "," <$> OP.str) ( + includeJannoColumns = OP.option (IncludeJannoColumns . map BSC.pack . splitOn "," <$> OP.str) ( OP.long "includeColumns" <> OP.help "A comma-separated list of .janno column names to coalesce. \ \If not specified, all columns that can be found in the source \ \and target will get filled." ) - excludeJannoColumns = OP.option (ExcludeJannoColumns . splitOn "," <$> OP.str) ( + excludeJannoColumns = OP.option (ExcludeJannoColumns . map BSC.pack . splitOn "," <$> OP.str) ( OP.long "excludeColumns" <> OP.help "A comma-separated list of .janno column names NOT to coalesce. \ \All columns that can be found in the source and target will get filled, \ From f478969df8cb1bb31bd5fb8f4900df3558761542 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 22 Feb 2024 19:07:17 +0100 Subject: [PATCH 26/34] update of golden tests output --- src/Poseidon/CLI/Jannocoalesce.hs | 16 +++++++--------- .../GoldenTestCheckSumFile.txt | 2 +- .../GoldenTestData/chronicle/chronicle2.yml | 14 +++++++------- .../GoldenTestData/jannocoalesce/target2.janno | 8 ++++---- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index c7909671..70301e42 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -101,15 +101,13 @@ matchWithOptionalStrip maybeRegex id1 id2 = mergeRow :: JannoRow -> JannoRow -> CoalesceJannoColumnSpec -> Bool -> String -> String -> PoseidonIO JannoRow mergeRow targetRow sourceRow fields overwrite sKey tKey = do - let targetRowRecord = Csv.toNamedRecord targetRow - sourceRowRecord = Csv.toNamedRecord sourceRow - sourceKeys = HM.keys sourceRowRecord - sourceKeysWanted = determineDesiredSourceKeys sourceKeys fields - targetComplete = HM.union targetRowRecord (HM.fromList $ map (, "" :: BSC.ByteString) sourceKeysWanted) - newRowRecord = HM.mapWithKey fillFromSource targetComplete - --targetComplete = HM.union targetRowRecord (HM.fromList $ map (,BSC.pack "") sourceKeys) - --newRowRecord = HM.unionWithKey mergeIfMissing targetComplete sourceRowRecord - parseResult = Csv.runParser . Csv.parseNamedRecord $ newRowRecord + let targetRowRecord = Csv.toNamedRecord targetRow + sourceRowRecord = Csv.toNamedRecord sourceRow + sourceKeys = HM.keys sourceRowRecord + sourceKeysDesired = determineDesiredSourceKeys sourceKeys fields + targetComplete = HM.union targetRowRecord (HM.fromList $ map (, "" :: BSC.ByteString) sourceKeysDesired) + newRowRecord = HM.mapWithKey fillFromSource targetComplete + parseResult = Csv.runParser . Csv.parseNamedRecord $ newRowRecord logInfo $ "matched target " ++ BSC.unpack (targetComplete HM.! (BSC.pack tKey)) ++ " with source " ++ BSC.unpack (sourceRowRecord HM.! (BSC.pack sKey)) case parseResult of diff --git a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt index 38598237..e1929b66 100644 --- a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt +++ b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt @@ -114,5 +114,5 @@ b43da4d5734371c0648553120f812466 fetch fetch/multi_packages_2/Lamnidis_2018-1.0. b2286cf9af7c6c8757b8109a1f58e2d9 listRemote listRemote/listRemote3 0433b2a80ee5a2eb5bf8c6404130e562 listRemote listRemote/listRemote4 282cedf121f37e81c1e45ec0dfb97560 jannocoalesce jannocoalesce/target1.janno -bca03ca94c0f4d872aafd03d649a4f6c jannocoalesce jannocoalesce/target2.janno +df34d0542c0a94cf9556619bff2e301d jannocoalesce jannocoalesce/target2.janno a202f0c1636d55258454ad0a0dfea977 jannocoalesce jannocoalesce/target3.janno \ No newline at end of file diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml index f446f239..8a07b513 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml +++ b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml @@ -1,29 +1,29 @@ title: Chronicle title description: Chronicle description chronicleVersion: 0.2.0 -lastModified: 2024-02-21 +lastModified: 2024-02-22 packages: - title: Lamnidis_2018 version: 1.0.0 - commit: 671da4388a118b5c2616cea7b935f2c35fc84060 + commit: 14f4b6a0b1670ebe4b2544eeb1d74efd4f618b28 path: Lamnidis_2018 - title: Lamnidis_2018 version: 1.0.1 - commit: 671da4388a118b5c2616cea7b935f2c35fc84060 + commit: 14f4b6a0b1670ebe4b2544eeb1d74efd4f618b28 path: Lamnidis_2018_newVersion - title: Schiffels version: 1.1.1 - commit: 9122486b44a36185bea5033d41220e474bb0ada9 + commit: de7c7af1794902b9f85bc57975a0af960d5c24bc path: Schiffels - title: Schiffels_2016 version: 1.0.1 - commit: 671da4388a118b5c2616cea7b935f2c35fc84060 + commit: 14f4b6a0b1670ebe4b2544eeb1d74efd4f618b28 path: Schiffels_2016 - title: Schmid_2028 version: 1.0.0 - commit: 671da4388a118b5c2616cea7b935f2c35fc84060 + commit: 14f4b6a0b1670ebe4b2544eeb1d74efd4f618b28 path: Schmid_2028 - title: Wang_2020 version: 0.1.0 - commit: 671da4388a118b5c2616cea7b935f2c35fc84060 + commit: 14f4b6a0b1670ebe4b2544eeb1d74efd4f618b28 path: Wang_2020 diff --git a/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target2.janno b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target2.janno index cec0295f..65080c7c 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target2.janno +++ b/test/PoseidonGoldenTests/GoldenTestData/jannocoalesce/target2.janno @@ -1,4 +1,4 @@ -Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AdditionalColumn1 AdditionalColumn2 -XXX011 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 0.0 0.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a test1 test2 -XXX012 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a -90.0 -180.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a test3 test4 -XXX013 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 90.0 180.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a test5 test6 +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords +XXX011 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 0.0 0.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX012 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a -90.0 -180.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a +XXX013 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 90.0 180.0 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a From a1a796f7da5a866fd236124889f2df1ff96da1de Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 22 Feb 2024 19:11:07 +0100 Subject: [PATCH 27/34] stylish-haskell --- src/Poseidon/CLI/Jannocoalesce.hs | 6 +++--- src/Poseidon/CLI/OptparseApplicativeParsers.hs | 5 +++-- test/Poseidon/JannocoalesceSpec.hs | 3 ++- test/PoseidonGoldenTests/GoldenTestsRunCommands.hs | 6 +++--- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 70301e42..1d327bb2 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -1,5 +1,5 @@ {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE TupleSections #-} +{-# LANGUAGE TupleSections #-} module Poseidon.CLI.Jannocoalesce where @@ -18,11 +18,11 @@ import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BSC import qualified Data.Csv as Csv import qualified Data.HashMap.Strict as HM +import Data.List ((\\)) import Data.Text (pack, replace, unpack) import System.Directory (createDirectoryIfMissing) import System.FilePath (takeDirectory) import Text.Regex.TDFA ((=~)) -import Data.List ((\\)) -- the source can be a single janno file, or a set of base directories as usual. data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] @@ -121,7 +121,7 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do logDebug $ "-- copied \"" ++ BSC.unpack val ++ "\" from column " ++ BSC.unpack key return r where - determineDesiredSourceKeys :: [BSC.ByteString] -> CoalesceJannoColumnSpec -> [BSC.ByteString] + determineDesiredSourceKeys :: [BSC.ByteString] -> CoalesceJannoColumnSpec -> [BSC.ByteString] determineDesiredSourceKeys keys AllJannoColumns = keys determineDesiredSourceKeys _ (IncludeJannoColumns included) = included determineDesiredSourceKeys keys (ExcludeJannoColumns excluded) = keys \\ excluded diff --git a/src/Poseidon/CLI/OptparseApplicativeParsers.hs b/src/Poseidon/CLI/OptparseApplicativeParsers.hs index 214a3f37..6c3a40a3 100644 --- a/src/Poseidon/CLI/OptparseApplicativeParsers.hs +++ b/src/Poseidon/CLI/OptparseApplicativeParsers.hs @@ -3,7 +3,8 @@ module Poseidon.CLI.OptparseApplicativeParsers where import Poseidon.CLI.Chronicle (ChronOperation (..)) -import Poseidon.CLI.Jannocoalesce (JannoSourceSpec (..), CoalesceJannoColumnSpec (..)) +import Poseidon.CLI.Jannocoalesce (CoalesceJannoColumnSpec (..), + JannoSourceSpec (..)) import Poseidon.CLI.List (ListEntity (..), RepoLocationSpec (..)) import Poseidon.CLI.Rectify (ChecksumsToRectify (..), @@ -25,6 +26,7 @@ import Poseidon.Version (VersionComponent (..), parseVersion) import Control.Applicative ((<|>)) +import qualified Data.ByteString.Char8 as BSC import Data.List.Split (splitOn) import Data.Version (Version) import qualified Options.Applicative as OP @@ -33,7 +35,6 @@ import System.FilePath (dropExtension, takeExtension, (<.>)) import qualified Text.Parsec as P import Text.Read (readMaybe) -import qualified Data.ByteString.Char8 as BSC parseChronOperation :: OP.Parser ChronOperation parseChronOperation = (CreateChron <$> parseChronOutPath) <|> (UpdateChron <$> parseChronUpdatePath) diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index 5ea8ebd1..b37b533e 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -1,7 +1,8 @@ {-# LANGUAGE OverloadedStrings #-} module Poseidon.JannocoalesceSpec (spec) where -import Poseidon.CLI.Jannocoalesce (makeNewJannoRows, mergeRow, CoalesceJannoColumnSpec (..)) +import Poseidon.CLI.Jannocoalesce (CoalesceJannoColumnSpec (..), + makeNewJannoRows, mergeRow) import Poseidon.Janno (CsvNamedRecord (..), JannoList (..), JannoRow (..), createMinimalSample, makeLatitude, diff --git a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs index 8c3e1d29..b86c1ffd 100644 --- a/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs +++ b/test/PoseidonGoldenTests/GoldenTestsRunCommands.hs @@ -9,10 +9,10 @@ import Poseidon.CLI.Forge (ForgeOptions (..), runForge) import Poseidon.CLI.Genoconvert (GenoconvertOptions (..), runGenoconvert) import Poseidon.CLI.Init (InitOptions (..), runInit) -import Poseidon.CLI.Jannocoalesce (JannoCoalesceOptions (..), +import Poseidon.CLI.Jannocoalesce (CoalesceJannoColumnSpec (..), + JannoCoalesceOptions (..), JannoSourceSpec (..), - runJannocoalesce, - CoalesceJannoColumnSpec (..)) + runJannocoalesce) import Poseidon.CLI.List (ListEntity (..), ListOptions (..), RepoLocationSpec (..), runList) import Poseidon.CLI.Rectify (ChecksumsToRectify (..), From 240e05d99e7bacdc2d0321f240f01c02b18bcec4 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Thu, 22 Feb 2024 23:13:15 +0100 Subject: [PATCH 28/34] some improvements to the jannocoalesce code --- src/Poseidon/CLI/Jannocoalesce.hs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 1d327bb2..23373144 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -101,15 +101,13 @@ matchWithOptionalStrip maybeRegex id1 id2 = mergeRow :: JannoRow -> JannoRow -> CoalesceJannoColumnSpec -> Bool -> String -> String -> PoseidonIO JannoRow mergeRow targetRow sourceRow fields overwrite sKey tKey = do - let targetRowRecord = Csv.toNamedRecord targetRow - sourceRowRecord = Csv.toNamedRecord sourceRow - sourceKeys = HM.keys sourceRowRecord + let sourceKeys = HM.keys sourceRowRecord sourceKeysDesired = determineDesiredSourceKeys sourceKeys fields - targetComplete = HM.union targetRowRecord (HM.fromList $ map (, "" :: BSC.ByteString) sourceKeysDesired) + targetComplete = HM.union targetRowRecord (HM.fromList $ map (, BSC.empty) sourceKeysDesired) newRowRecord = HM.mapWithKey fillFromSource targetComplete parseResult = Csv.runParser . Csv.parseNamedRecord $ newRowRecord - logInfo $ "matched target " ++ BSC.unpack (targetComplete HM.! (BSC.pack tKey)) ++ - " with source " ++ BSC.unpack (sourceRowRecord HM.! (BSC.pack sKey)) + logInfo $ "matched target " ++ BSC.unpack (targetComplete HM.! BSC.pack tKey) ++ + " with source " ++ BSC.unpack (sourceRowRecord HM.! BSC.pack sKey) case parseResult of Left err -> throwM . PoseidonGenericException $ ".janno row-merge error: " ++ err Right r -> do @@ -121,6 +119,10 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do logDebug $ "-- copied \"" ++ BSC.unpack val ++ "\" from column " ++ BSC.unpack key return r where + targetRowRecord :: Csv.NamedRecord + targetRowRecord = Csv.toNamedRecord targetRow + sourceRowRecord :: Csv.NamedRecord + sourceRowRecord = Csv.toNamedRecord sourceRow determineDesiredSourceKeys :: [BSC.ByteString] -> CoalesceJannoColumnSpec -> [BSC.ByteString] determineDesiredSourceKeys keys AllJannoColumns = keys determineDesiredSourceKeys _ (IncludeJannoColumns included) = included @@ -132,8 +134,8 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do -- overwrite field only if it's requested && includeField key fields -- overwrite only empty fields, except overwrite is set - && (targetVal `elem` ["n/a", ""] || overwrite) - then HM.findWithDefault "" key (Csv.toNamedRecord sourceRow) + && (targetVal `elem` ["n/a", "", BSC.empty] || overwrite) + then HM.findWithDefault "" key sourceRowRecord else targetVal includeField :: BSC.ByteString -> CoalesceJannoColumnSpec -> Bool includeField _ AllJannoColumns = True From 4df6d498a88a0e970881696c456e21ae41ab565c Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 23 Feb 2024 11:34:04 +0100 Subject: [PATCH 29/34] added some helpful summary statistics for jannocoalesce with an IORef --- src/Poseidon/CLI/Jannocoalesce.hs | 45 +++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 23373144..e3c6b385 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -10,9 +10,9 @@ import Poseidon.Package (PackageReadOptions (..), getJointJanno, readPoseidonPackageCollection) import Poseidon.Utils (PoseidonException (..), PoseidonIO, - logDebug, logInfo) + logDebug, logInfo, logWarning) -import Control.Monad (filterM, forM, forM_) +import Control.Monad (filterM, forM_, when) import Control.Monad.Catch (MonadThrow, throwM) import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BSC @@ -23,6 +23,7 @@ import Data.Text (pack, replace, unpack) import System.Directory (createDirectoryIfMissing) import System.FilePath (takeDirectory) import Text.Regex.TDFA ((=~)) +import qualified Data.IORef as R -- the source can be a single janno file, or a set of base directories as usual. data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] @@ -65,18 +66,33 @@ runJannocoalesce (JannoCoalesceOptions sourceSpec target outSpec fields overwrit createDirectoryIfMissing True (takeDirectory outPath) writeJannoFile outPath (JannoRows newJanno) +type CounterMismatches = R.IORef Int +type CounterCopied = R.IORef Int + makeNewJannoRows :: [JannoRow] -> [JannoRow] -> CoalesceJannoColumnSpec -> Bool -> String -> String -> Maybe String -> PoseidonIO [JannoRow] makeNewJannoRows sourceRows targetRows fields overwrite sKey tKey maybeStrip = do logInfo "Starting to coalesce..." - forM targetRows $ \targetRow -> do - posId <- getKeyFromJanno targetRow tKey - sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows - case sourceRowCandidates of - [] -> do - logInfo $ "no match for target " ++ posId ++ " in source" - return targetRow - [keyRow] -> mergeRow targetRow keyRow fields overwrite sKey tKey - _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId + counterMismatches <- liftIO $ R.newIORef 0 + counterCopied <- liftIO $ R.newIORef 0 + newRows <- mapM (makeNewJannoRow counterMismatches counterCopied) targetRows + counterCopiedVal <- liftIO $ R.readIORef counterCopied + counterMismatchesVal <- liftIO $ R.readIORef counterMismatches + logInfo $ "Copied " ++ show counterCopiedVal ++ " values" + when (counterMismatchesVal > 0) $ + logWarning $ "Failed to find matches for " ++ show counterMismatchesVal ++ " target rows in source" + return newRows + where + makeNewJannoRow :: CounterMismatches -> CounterCopied -> JannoRow -> PoseidonIO JannoRow + makeNewJannoRow cm cp targetRow = do + posId <- getKeyFromJanno targetRow tKey + sourceRowCandidates <- filterM (\r -> (matchWithOptionalStrip maybeStrip posId) <$> getKeyFromJanno r sKey) sourceRows + case sourceRowCandidates of + [] -> do + logWarning $ "no match for target " ++ posId ++ " in source" + liftIO $ R.modifyIORef cm (+1) + return targetRow + [keyRow] -> mergeRow cp targetRow keyRow fields overwrite sKey tKey + _ -> throwM $ PoseidonGenericException $ "source file contains multiple rows with key " ++ posId getKeyFromJanno :: (MonadThrow m) => JannoRow -> String -> m String getKeyFromJanno jannoRow key = do @@ -99,8 +115,8 @@ matchWithOptionalStrip maybeRegex id1 id2 = let match = s =~ r in if null match then s else unpack $ replace (pack match) "" (pack s) -mergeRow :: JannoRow -> JannoRow -> CoalesceJannoColumnSpec -> Bool -> String -> String -> PoseidonIO JannoRow -mergeRow targetRow sourceRow fields overwrite sKey tKey = do +mergeRow :: CounterCopied -> JannoRow -> JannoRow -> CoalesceJannoColumnSpec -> Bool -> String -> String -> PoseidonIO JannoRow +mergeRow cp targetRow sourceRow fields overwrite sKey tKey = do let sourceKeys = HM.keys sourceRowRecord sourceKeysDesired = determineDesiredSourceKeys sourceKeys fields targetComplete = HM.union targetRowRecord (HM.fromList $ map (, BSC.empty) sourceKeysDesired) @@ -115,7 +131,8 @@ mergeRow targetRow sourceRow fields overwrite sKey tKey = do if HM.null newFields then do logDebug "-- no changes" else do - forM_ (HM.toList newFields) $ \(key, val) -> + forM_ (HM.toList newFields) $ \(key, val) -> do + liftIO $ R.modifyIORef cp (+1) logDebug $ "-- copied \"" ++ BSC.unpack val ++ "\" from column " ++ BSC.unpack key return r where From 8460a36aa12a970b162ce04b961796d282d7504b Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 23 Feb 2024 11:46:37 +0100 Subject: [PATCH 30/34] fixed tests, which call mergeRow directly --- test/Poseidon/JannocoalesceSpec.hs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index b37b533e..378aba0e 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -12,6 +12,8 @@ import Poseidon.Utils (testLog) import qualified Data.HashMap.Strict as HM import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), Sex (..)) import Test.Hspec +import Control.Monad.IO.Class (liftIO) +import qualified Data.IORef as R spec :: Spec spec = do @@ -65,7 +67,8 @@ testMergeSingleRow :: Spec testMergeSingleRow = describe "Poseidon.Jannocoalesce.mergeRow" $ do it "should correctly merge without fields and no override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow AllJannoColumns False "Poseidon_ID" "Poseidon_ID" + cp <- liftIO $ R.newIORef 0 + merged <- testLog $ mergeRow cp jannoTargetRow jannoSourceRow AllJannoColumns False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 @@ -75,7 +78,8 @@ testMergeSingleRow = ("AdditionalColumn2", "C") ]) it "should correctly merge without fields and override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow AllJannoColumns True "Poseidon_ID" "Poseidon_ID" + cp <- liftIO $ R.newIORef 0 + merged <- testLog $ mergeRow cp jannoTargetRow jannoSourceRow AllJannoColumns True "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Salzburg" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 @@ -85,7 +89,8 @@ testMergeSingleRow = ("AdditionalColumn2", "B") ]) it "should correctly merge with fields selection and no override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (IncludeJannoColumns ["Group_Name", "Latitude"]) False "Poseidon_ID" "Poseidon_ID" + cp <- liftIO $ R.newIORef 0 + merged <- testLog $ mergeRow cp jannoTargetRow jannoSourceRow (IncludeJannoColumns ["Group_Name", "Latitude"]) False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` makeLatitude 30.0 @@ -94,7 +99,8 @@ testMergeSingleRow = ("AdditionalColumn2", "C") ]) it "should correctly merge with negative field selection and no override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (ExcludeJannoColumns ["Latitude"]) False "Poseidon_ID" "Poseidon_ID" + cp <- liftIO $ R.newIORef 0 + merged <- testLog $ mergeRow cp jannoTargetRow jannoSourceRow (ExcludeJannoColumns ["Latitude"]) False "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop"] jLatitude merged `shouldBe` Nothing @@ -104,7 +110,8 @@ testMergeSingleRow = ("AdditionalColumn2", "C") ]) it "should correctly merge with fields and override" $ do - merged <- testLog $ mergeRow jannoTargetRow jannoSourceRow (IncludeJannoColumns ["Group_Name", "Latitude"]) True "Poseidon_ID" "Poseidon_ID" + cp <- liftIO $ R.newIORef 0 + merged <- testLog $ mergeRow cp jannoTargetRow jannoSourceRow (IncludeJannoColumns ["Group_Name", "Latitude"]) True "Poseidon_ID" "Poseidon_ID" jSite merged `shouldBe` Just "Vienna" jGroupName merged `shouldBe` JannoList ["SamplePop2"] jLatitude merged `shouldBe` makeLatitude 30.0 From b3cff293e0fe5feaad0803b143e28ca2a3f86379 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Fri, 23 Feb 2024 11:47:16 +0100 Subject: [PATCH 31/34] stylish-haskell --- src/Poseidon/CLI/Jannocoalesce.hs | 2 +- test/Poseidon/JannocoalesceSpec.hs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index e3c6b385..51bb2a52 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -18,12 +18,12 @@ import Control.Monad.IO.Class (liftIO) import qualified Data.ByteString.Char8 as BSC import qualified Data.Csv as Csv import qualified Data.HashMap.Strict as HM +import qualified Data.IORef as R import Data.List ((\\)) import Data.Text (pack, replace, unpack) import System.Directory (createDirectoryIfMissing) import System.FilePath (takeDirectory) import Text.Regex.TDFA ((=~)) -import qualified Data.IORef as R -- the source can be a single janno file, or a set of base directories as usual. data JannoSourceSpec = JannoSourceSingle FilePath | JannoSourceBaseDirs [FilePath] diff --git a/test/Poseidon/JannocoalesceSpec.hs b/test/Poseidon/JannocoalesceSpec.hs index 378aba0e..2f088453 100644 --- a/test/Poseidon/JannocoalesceSpec.hs +++ b/test/Poseidon/JannocoalesceSpec.hs @@ -9,11 +9,11 @@ import Poseidon.Janno (CsvNamedRecord (..), makeLongitude) import Poseidon.Utils (testLog) +import Control.Monad.IO.Class (liftIO) import qualified Data.HashMap.Strict as HM +import qualified Data.IORef as R import SequenceFormats.Eigenstrat (EigenstratIndEntry (..), Sex (..)) import Test.Hspec -import Control.Monad.IO.Class (liftIO) -import qualified Data.IORef as R spec :: Spec spec = do From fb22b43eeff894236e369268571370d29ad267b7 Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Fri, 23 Feb 2024 23:01:07 +0100 Subject: [PATCH 32/34] added a single comment to mergeRow --- src/Poseidon/CLI/Jannocoalesce.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Poseidon/CLI/Jannocoalesce.hs b/src/Poseidon/CLI/Jannocoalesce.hs index 51bb2a52..8a5cc64e 100644 --- a/src/Poseidon/CLI/Jannocoalesce.hs +++ b/src/Poseidon/CLI/Jannocoalesce.hs @@ -119,6 +119,7 @@ mergeRow :: CounterCopied -> JannoRow -> JannoRow -> CoalesceJannoColumnSpec -> mergeRow cp targetRow sourceRow fields overwrite sKey tKey = do let sourceKeys = HM.keys sourceRowRecord sourceKeysDesired = determineDesiredSourceKeys sourceKeys fields + -- fill in the target row with dummy values for desired fields that might not be present yet targetComplete = HM.union targetRowRecord (HM.fromList $ map (, BSC.empty) sourceKeysDesired) newRowRecord = HM.mapWithKey fillFromSource targetComplete parseResult = Csv.runParser . Csv.parseNamedRecord $ newRowRecord From 12bbf35ed4a130af6e3758d3cd2b1f8cbfab5c5f Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Mon, 26 Feb 2024 11:55:38 +0100 Subject: [PATCH 33/34] work on the release changelog --- CHANGELOGRELEASE.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOGRELEASE.md b/CHANGELOGRELEASE.md index 353cd2db..493c7679 100644 --- a/CHANGELOGRELEASE.md +++ b/CHANGELOGRELEASE.md @@ -1,3 +1,17 @@ +### V 1.4.1.0 + +This release improves the error messages for broken `.janno` files and adds an entirely new subcommand to merge two `.janno` files: `jannocoalecse`. + +#### Merging `.janno` files with `jannocoalesce` + +The need for a tool to combine the information of two `.janno` files arose in the Poseidon ecosystem as we started to conceptualize the Poseidon [Minotaur Archive](https://www.poseidon-adna.org/#/archive_overview?id=the-poseidon-minotaur-archive-pma). This archive will be populated by paper-wise Poseidon packages for which the genotype data was regenerated through the Minotaur workflow (work in progress). We plan to reprocess various packages that are already in the [Poseidon Community Archive](https://www.poseidon-adna.org/#/archive_overview?id=the-poseidon-community-archive-pca) and want to copy for these packages e.g. spatiotemporal information from the already available `.janno` files. `jannocoalesce` is the answer to this specific need, but can also be useful for various other applications. + +It generally works by reading a source `.janno` file with `-s|--sourceFile` (or all `.janno` files in a `-d|--baseDir`) and a target `.janno` file with `-t|--targetFile`. It then merges these files by a key column in each of these files, which can be selected with `--sourceKey` and `--targetKey`. The default for both of these is the `Poseidon_ID` column. In case the entries in these key columns slightly and systematically differ, e.g. because the `Poseidon_ID`s in either have a special suffix (for example `_SG`), then the `--stripIdRegex` option allows to strip these with a regular expression. + +`jannocoalesce` generally attempts to fill all empty cells in the target `.janno` file with information from the source. `--includeColumns` and `--excludeColumns` allow to select specific columns for which this should be done. In some cases it may be desirable to not just fill empty fields in the target, but overwrite the information already there with the `-f|--force` option. If the target file should be preserved, then the output can be directed to a new output `.janno` file with `-o|--outFile`. + + + ### V 1.4.0.3 This small release fixes a performance issue related to finding the latest version of all packages. The bug had severe detrimental effects on `forge` and `fetch`, which are now resolved. From d81a951678787c96a8eb4183298483651b98bd15 Mon Sep 17 00:00:00 2001 From: Clemens Schmid Date: Mon, 26 Feb 2024 12:38:34 +0100 Subject: [PATCH 34/34] completed the release-changelog --- CHANGELOGRELEASE.md | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/CHANGELOGRELEASE.md b/CHANGELOGRELEASE.md index 493c7679..7a6cafd8 100644 --- a/CHANGELOGRELEASE.md +++ b/CHANGELOGRELEASE.md @@ -1,16 +1,48 @@ ### V 1.4.1.0 -This release improves the error messages for broken `.janno` files and adds an entirely new subcommand to merge two `.janno` files: `jannocoalecse`. +This release adds an entirely new subcommand to merge two `.janno` files (`jannocoalecse`) and improves the error messages for broken `.janno` files. #### Merging `.janno` files with `jannocoalesce` -The need for a tool to combine the information of two `.janno` files arose in the Poseidon ecosystem as we started to conceptualize the Poseidon [Minotaur Archive](https://www.poseidon-adna.org/#/archive_overview?id=the-poseidon-minotaur-archive-pma). This archive will be populated by paper-wise Poseidon packages for which the genotype data was regenerated through the Minotaur workflow (work in progress). We plan to reprocess various packages that are already in the [Poseidon Community Archive](https://www.poseidon-adna.org/#/archive_overview?id=the-poseidon-community-archive-pca) and want to copy for these packages e.g. spatiotemporal information from the already available `.janno` files. `jannocoalesce` is the answer to this specific need, but can also be useful for various other applications. +The need for a tool to combine the information of two `.janno` files arose in the Poseidon ecosystem as we started to conceptualize the Poseidon [Minotaur Archive](https://github.com/poseidon-framework/minotaur-archive). This archive will be populated by paper-wise Poseidon packages for which the genotype data was regenerated through the Minotaur workflow (work in progress). We plan to reprocess various packages that are already in the [Poseidon Community Archive](https://github.com/poseidon-framework/community-archive) and for these packages we want to copy e.g. spatiotemporal information from the already available `.janno` files. `jannocoalesce` is the answer to this specific need, but can also be useful for various other applications. -It generally works by reading a source `.janno` file with `-s|--sourceFile` (or all `.janno` files in a `-d|--baseDir`) and a target `.janno` file with `-t|--targetFile`. It then merges these files by a key column in each of these files, which can be selected with `--sourceKey` and `--targetKey`. The default for both of these is the `Poseidon_ID` column. In case the entries in these key columns slightly and systematically differ, e.g. because the `Poseidon_ID`s in either have a special suffix (for example `_SG`), then the `--stripIdRegex` option allows to strip these with a regular expression. +It generally works by reading a source `.janno` file with `-s|--sourceFile` (or all `.janno` files in a `-d|--baseDir`) and a target `.janno` file with `-t|--targetFile`. It then merges these files by a key column, which can be selected with `--sourceKey` and `--targetKey`. The default for both of these key columns is the `Poseidon_ID`. In case the entries in the key columns slightly and systematically differ, e.g. because the `Poseidon_ID`s in either have a special suffix (for example `_SG`), then the `--stripIdRegex` option allows to strip these with a regular expression. -`jannocoalesce` generally attempts to fill all empty cells in the target `.janno` file with information from the source. `--includeColumns` and `--excludeColumns` allow to select specific columns for which this should be done. In some cases it may be desirable to not just fill empty fields in the target, but overwrite the information already there with the `-f|--force` option. If the target file should be preserved, then the output can be directed to a new output `.janno` file with `-o|--outFile`. +`jannocoalesce` generally attempts to fill **all** empty cells in the target `.janno` file with information from the source. `--includeColumns` and `--excludeColumns` allow to select specific columns for which this should be done. In some cases it may be desirable to not just fill empty fields in the target, but overwrite the information already there with the `-f|--force` option. If the target file should be preserved, then the output can be directed to a new output `.janno` file with `-o|--outFile`. +#### Better error messages for broken `.janno` files +`.janno` file validation is a core feature of `trident`. With this release we try to improve the error messages for a two common situations: + +1. Broken number fields. This can happen, if some text or wrong character ends up in a number field. + +So far the error messages for this case have been pretty technical. Here for example if an integer field is filled with `430;`, where the integer number `430` is accidentally written with a trailing `;`: + +``` +parse error (Failed reading: conversion error: expected Int, got "430;" (incomplete field parse, leftover: [59])) +``` + +The new error message is more clear: + +``` +parse error in one column (expected data type: Int, broken value: "430;", problematic characters: ";") +``` + +2. Inconsistent `Date_*`, `Contamination_*` and `Relation_*` columns. These sets of columns have to be cross-consistent, following a logic that is especially complex for the `Date_*` fields (see [here](https://www.poseidon-adna.org/#/janno_details?id=the-columns-in-detail)). + +So far any inconsistency was reported with this generic error message: + +``` +The Date_* columns are not consistent +``` + +Now we include far more precise messages, like e.g.: + +``` +Date_Type is not "C14", but either Date_C14_Uncal_BP or Date_C14_Uncal_BP_Err are not empty. +``` + +This should simplify tedious `.janno` file debugging in the future. ### V 1.4.0.3