Skip to content

Commit

Permalink
Merge pull request #283 from poseidon-framework/moreInformativeConsis…
Browse files Browse the repository at this point in the history
…tencyChecks

More informative consistency checks
  • Loading branch information
nevrome authored Feb 21, 2024
2 parents c31b17c + b7acba7 commit e7e1996
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 81 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
- V 1.4.0.4:
- Added better error messages for generic cassava parsing (e.g. for broken Int and Double fields) in .janno files.
- Added better error handling and messages for inconsistent `Date_*`, `Contamination_*` and `Relation_*` columns in .janno files using an `Except` & `Writer` monad stack.
- Cleaned a bit in the `SequencingSource` module.
- V 1.4.0.3:
- Fixed a severe performance leak in code around `resolveEntityIndices`, which was called in various functions and wastefully recomputed `isLatestInCollection` way too often. This affected simple commands, like fetching a few packages from the server, forging, and has effects also in xerxes.
- Bumped to a newer Compiler (GHC 9.4.7) and new Stackage Snapshot (LTS-21.17)
Expand Down
2 changes: 1 addition & 1 deletion poseidon-hs.cabal
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: poseidon-hs
version: 1.4.0.3
version: 1.4.0.4
synopsis: A package with tools for working with Poseidon Genotype Data
description: The tools in this package read and analyse Poseidon-formatted genotype databases, a modular system for storing genotype data from thousands of individuals.
license: MIT
Expand Down
170 changes: 118 additions & 52 deletions src/Poseidon/Janno.hs
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,23 @@ module Poseidon.Janno (
decodingOptions,
explicitNA,
removeUselessSuffix,
parseCsvParseError,
renderCsvParseError,
getMaybeJannoList
) where

import Poseidon.Utils (PoseidonException (..),
PoseidonIO, logDebug,
logError,
logError, logWarning,
renderPoseidonException)


import Control.Applicative (empty)
import Control.Exception (throwIO)
import Control.Monad (unless, when)
import qualified Control.Monad.Except as E
import Control.Monad.IO.Class (liftIO)
import qualified Control.Monad.Writer as W
import Country (Country, alphaTwoUpper,
decodeAlphaTwo)
import Data.Aeson (FromJSON, Options (..),
Expand All @@ -66,14 +70,14 @@ import Data.Aeson.Types (emptyObject)
import Data.Bifunctor (second)
import qualified Data.ByteString.Char8 as Bchs
import qualified Data.ByteString.Lazy.Char8 as Bch
import Data.Char (isSpace, ord)
import Data.Char (chr, isSpace, ord)
import qualified Data.Csv as Csv
import Data.Either (lefts, rights)
import qualified Data.HashMap.Strict as HM
import Data.List (elemIndex, foldl',
intercalate, nub, sort,
(\\))
import Data.Maybe (fromJust, isNothing)
import Data.Maybe (fromJust)
import Data.Text (pack, replace, unpack)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
Expand All @@ -84,6 +88,8 @@ import Network.URI (isURIReference)
import Options.Applicative.Help.Levenshtein (editDistance)
import SequenceFormats.Eigenstrat (EigenstratIndEntry (..),
Sex (..))
import qualified Text.Parsec as P
import qualified Text.Parsec.String as P
import qualified Text.Regex.TDFA as Reg

-- | A datatype for genetic sex
Expand Down Expand Up @@ -996,15 +1002,27 @@ findSimilarNames reference = map (findSimilar reference)
-- | A function to load one row of a janno file
readJannoFileRow :: FilePath -> (Int, Bch.ByteString) -> PoseidonIO (Either PoseidonException JannoRow)
readJannoFileRow jannoPath (lineNumber, row) = do
case Csv.decodeByNameWith decodingOptions row of
let decoded = Csv.decodeByNameWith decodingOptions row
simplifiedDecoded = (\(_,rs) -> V.head rs) <$> decoded
case simplifiedDecoded of
Left e -> do
return $ Left $ PoseidonFileRowException jannoPath lineNumber $ removeUselessSuffix e
Right (_, jannoRow :: V.Vector JannoRow) -> do
case checkJannoRowConsistency jannoPath lineNumber $ V.head jannoRow of
Left e -> do
return $ Left e
Right (pS :: JannoRow) -> do
return $ Right pS
let betterError = case P.parse parseCsvParseError "" e of
Left _ -> removeUselessSuffix e
Right result -> renderCsvParseError result
return $ Left $ PoseidonFileRowException jannoPath (show lineNumber) betterError
Right jannoRow -> do
let (errOrJannoRow, warnings) = W.runWriter (E.runExceptT (checkJannoRowConsistency jannoRow))
mapM_ (logWarning . renderWarning) warnings
case errOrJannoRow of
Left e -> return $ Left $ PoseidonFileRowException jannoPath renderLocation e
Right r -> return $ Right r
where
renderWarning :: String -> String
renderWarning e = "Issue in " ++ jannoPath ++ " " ++
"in line " ++ renderLocation ++ ": " ++ e
renderLocation :: String
renderLocation = show lineNumber ++
" (Poseidon_ID: " ++ jPoseidonID jannoRow ++ ")"

decodingOptions :: Csv.DecodeOptions
decodingOptions = Csv.defaultDecodeOptions {
Expand All @@ -1014,6 +1032,33 @@ decodingOptions = Csv.defaultDecodeOptions {
removeUselessSuffix :: String -> String
removeUselessSuffix = unpack . replace " at \"\"" "" . pack

-- reformat the parser error
-- from: parse error (Failed reading: conversion error: expected Int, got "430;" (incomplete field parse, leftover: [59]))
-- to: parse error in one column (expected data type: Int, broken value: "430;", problematic characters: ";")

data CsvParseError = CsvParseError {
_expected :: String
, _actual :: String
, _leftover :: String
} deriving Show

parseCsvParseError :: P.Parser CsvParseError
parseCsvParseError = do
_ <- P.string "parse error (Failed reading: conversion error: expected "
expected <- P.manyTill P.anyChar (P.try (P.string ", got "))
actual <- P.manyTill P.anyChar (P.try (P.string " (incomplete field parse, leftover: ["))
leftoverList <- P.sepBy (read <$> P.many1 P.digit) (P.char ',')
_ <- P.char ']'
_ <- P.many P.anyChar
return $ CsvParseError expected actual (map chr leftoverList)

renderCsvParseError :: CsvParseError -> String
renderCsvParseError (CsvParseError expected actual leftover) =
"parse error in one column (" ++
"expected data type: " ++ expected ++ ", " ++
"broken value: " ++ actual ++ ", " ++
"problematic characters: " ++ show leftover ++ ")"

-- | A helper functions to replace empty bytestrings values in janno files with explicit "n/a"
explicitNA :: Bch.ByteString -> Bch.ByteString
explicitNA = replaceInJannoBytestring Bch.empty "n/a"
Expand All @@ -1026,7 +1071,7 @@ replaceInJannoBytestring from to tsv =
tsvRowsUpdated = map (Bch.intercalate (Bch.pack "\t")) tsvCellsUpdated
in Bch.unlines tsvRowsUpdated

-- Janno consistency checks
-- Global janno consistency checks

checkJannoConsistency :: FilePath -> JannoRows -> Either PoseidonException JannoRows
checkJannoConsistency jannoPath xs
Expand All @@ -1037,23 +1082,27 @@ checkJannoConsistency jannoPath xs
checkIndividualUnique :: JannoRows -> Bool
checkIndividualUnique (JannoRows rows) = length rows == length (nub $ map jPoseidonID rows)

checkJannoRowConsistency :: FilePath -> Int -> JannoRow -> Either PoseidonException JannoRow
checkJannoRowConsistency jannoPath row x
| not $ checkMandatoryStringNotEmpty x = Left $ PoseidonFileRowException jannoPath row
"The mandatory columns Poseidon_ID and Group_Name contain empty values"
| not $ checkC14ColsConsistent x = Left $ PoseidonFileRowException jannoPath row
"The Date_* columns are not consistent"
| not $ checkContamColsConsistent x = Left $ PoseidonFileRowException jannoPath row
"The Contamination_* columns are not consistent"
| not $ checkRelationColsConsistent x = Left $ PoseidonFileRowException jannoPath row
"The Relation_* columns are not consistent"
| otherwise = Right x

checkMandatoryStringNotEmpty :: JannoRow -> Bool
-- Row-wise janno consistency checks

type JannoRowWarnings = [String]
type JannoRowLog = E.ExceptT String (W.Writer JannoRowWarnings)

checkJannoRowConsistency :: JannoRow -> JannoRowLog JannoRow
checkJannoRowConsistency x =
return x
>>= checkMandatoryStringNotEmpty
>>= checkC14ColsConsistent
>>= checkContamColsConsistent
>>= checkRelationColsConsistent

checkMandatoryStringNotEmpty :: JannoRow -> JannoRowLog JannoRow
checkMandatoryStringNotEmpty x =
(not . null . jPoseidonID $ x)
&& (not . null . getJannoList . jGroupName $ x)
&& (not . null . head . getJannoList . jGroupName $ x)
let notEmpty = (not . null . jPoseidonID $ x) &&
(not . null . getJannoList . jGroupName $ x) &&
(not . null . head . getJannoList . jGroupName $ x)
in case notEmpty of
False -> E.throwError "Poseidon_ID or Group_Name are empty"
True -> return x

getCellLength :: Maybe (JannoList a) -> Int
getCellLength = maybe 0 (length . getJannoList)
Expand All @@ -1062,33 +1111,50 @@ allEqual :: Eq a => [a] -> Bool
allEqual [] = True
allEqual x = length (nub x) == 1

checkC14ColsConsistent :: JannoRow -> Bool
checkC14ColsConsistent :: JannoRow -> JannoRowLog JannoRow
checkC14ColsConsistent x =
let lLabnr = getCellLength $ jDateC14Labnr x
lUncalBP = getCellLength $ jDateC14UncalBP x
lUncalBPErr = getCellLength $ jDateC14UncalBPErr x
shouldBeTypeC14 = lUncalBP > 0
isTypeC14 = jDateType x == Just C14
in
(lLabnr == 0 || lUncalBP == 0 || lLabnr == lUncalBP)
&& (lLabnr == 0 || lUncalBPErr == 0 || lLabnr == lUncalBPErr)
&& lUncalBP == lUncalBPErr
&& (not shouldBeTypeC14 || isTypeC14)

checkContamColsConsistent :: JannoRow -> Bool
let isTypeC14 = jDateType x == Just C14
lLabnr = getCellLength $ jDateC14Labnr x
lUncalBP = getCellLength $ jDateC14UncalBP x
lUncalBPErr = getCellLength $ jDateC14UncalBPErr x
anyMainColFilled = lUncalBP > 0 || lUncalBPErr > 0
anyMainColEmpty = lUncalBP == 0 || lUncalBPErr == 0
allSameLength = allEqual [lLabnr, lUncalBP, lUncalBPErr] ||
(lLabnr == 0 && lUncalBP == lUncalBPErr)
in case (isTypeC14, anyMainColFilled, anyMainColEmpty, allSameLength) of
(False, False, _, _ ) -> return x
(False, True, _, _ ) -> E.throwError "Date_Type is not \"C14\", but either \
\Date_C14_Uncal_BP or Date_C14_Uncal_BP_Err are not empty"
(True, _, False, False) -> E.throwError "Date_C14_Labnr, Date_C14_Uncal_BP and Date_C14_Uncal_BP_Err \
\do not have the same lengths"
(True, _, False, True ) -> return x
-- this should be an error, but we have legacy packages with this issue, so it's only a warning
(True, _, True, _ ) -> do
W.tell ["Date_Type is \"C14\", but either Date_C14_Uncal_BP or Date_C14_Uncal_BP_Err are empty"]
return x

checkContamColsConsistent :: JannoRow -> JannoRowLog JannoRow
checkContamColsConsistent x =
let lContamination = getCellLength $ jContamination x
lContaminationErr = getCellLength $ jContaminationErr x
lContaminationMeas = getCellLength $ jContaminationMeas x
in allEqual [lContamination, lContaminationErr, lContaminationMeas]

checkRelationColsConsistent :: JannoRow -> Bool
let lContamination = getCellLength $ jContamination x
lContaminationErr = getCellLength $ jContaminationErr x
lContaminationMeas = getCellLength $ jContaminationMeas x
allSameLength = allEqual [lContamination, lContaminationErr, lContaminationMeas]
in case allSameLength of
False -> E.throwError "Contamination, Contamination_Err and Contamination_Meas \
\do not have the same lengths"
True -> return x

checkRelationColsConsistent :: JannoRow -> JannoRowLog JannoRow
checkRelationColsConsistent x =
let lRelationTo = getCellLength $ jRelationTo x
lRelationDegree = getCellLength $ jRelationDegree x
lRelationType = getCellLength $ jRelationType x
in allEqual [lRelationTo, lRelationType, lRelationDegree]
|| (allEqual [lRelationTo, lRelationDegree] && isNothing (jRelationType x))
let lRelationTo = getCellLength $ jRelationTo x
lRelationDegree = getCellLength $ jRelationDegree x
lRelationType = getCellLength $ jRelationType x
allSameLength = allEqual [lRelationTo, lRelationDegree, lRelationType] ||
(allEqual [lRelationTo, lRelationDegree] && lRelationType == 0)
in case allSameLength of
False -> E.throwError "Relation_To, Relation_Degree and Relation_Type \
\do not have the same lengths. Relation_Type can be empty"
True -> return x

-- deriving with TemplateHaskell necessary for the generics magic in the Survey module
deriveGeneric ''JannoRow
44 changes: 18 additions & 26 deletions src/Poseidon/SequencingSource.hs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ import Poseidon.Janno (AccessionID (..),
explicitNA, filterLookup,
filterLookupOptional, getCsvNR,
makeAccessionID,
removeUselessSuffix)
parseCsvParseError,
removeUselessSuffix,
renderCsvParseError)
import Poseidon.Utils (PoseidonException (..), PoseidonIO,
logDebug, logError, logWarning,
renderPoseidonException)
Expand Down Expand Up @@ -39,6 +41,7 @@ import Data.Time.Format (defaultTimeLocale, formatTime,
import qualified Data.Vector as V
import Data.Yaml.Aeson (FromJSON (..))
import GHC.Generics (Generic)
import qualified Text.Parsec as P

-- |A datatype to represent UDG in a ssf file
data SSFUDG =
Expand Down Expand Up @@ -400,36 +403,25 @@ readSeqSourceFile seqSourcePath = do
mapM_ (logError . renderPoseidonException) $ take 5 $ lefts seqSourceRepresentation
liftIO $ throwIO $ PoseidonFileConsistencyException seqSourcePath "Broken lines."
else do
let consistentSeqSource = checkSeqSourceConsistency seqSourcePath $ SeqSourceRows $ rights seqSourceRepresentation
case consistentSeqSource of
Left e -> do liftIO $ throwIO (e :: PoseidonException)
Right x -> do
-- warnings
warnSeqSourceConsistency seqSourcePath x
-- finally return the good ones
return x
let seqSource = SeqSourceRows $ rights seqSourceRepresentation
warnSeqSourceConsistency seqSourcePath seqSource
return seqSource

-- | A function to read one row of a seqSourceFile
readSeqSourceFileRow :: FilePath -> (Int, Bch.ByteString) -> PoseidonIO (Either PoseidonException SeqSourceRow)
readSeqSourceFileRow seqSourcePath (lineNumber, row) = do
case Csv.decodeByNameWith decodingOptions row of
let decoded = Csv.decodeByNameWith decodingOptions row
simplifiedDecoded = (\(_,rs) -> V.head rs) <$> decoded
case simplifiedDecoded of
Left e -> do
return $ Left $ PoseidonFileRowException seqSourcePath lineNumber $ removeUselessSuffix e
Right (_, seqSourceRow :: V.Vector SeqSourceRow) -> do
case checkSeqSourceRowConsistency seqSourcePath lineNumber $ V.head seqSourceRow of
Left e -> do return $ Left e
Right (pS :: SeqSourceRow) -> do return $ Right pS

-- SeqSource consistency checks
checkSeqSourceConsistency :: FilePath -> SeqSourceRows -> Either PoseidonException SeqSourceRows
checkSeqSourceConsistency _ xs
-- | ... no tests implemented
| otherwise = Right xs

checkSeqSourceRowConsistency :: FilePath -> Int -> SeqSourceRow -> Either PoseidonException SeqSourceRow
checkSeqSourceRowConsistency _ _ x
-- | ... no tests implemented
| otherwise = Right x
let betterError = case P.parse parseCsvParseError "" e of
Left _ -> removeUselessSuffix e
Right result -> renderCsvParseError result
return $ Left $ PoseidonFileRowException seqSourcePath (show lineNumber) betterError
Right seqSourceRow -> do
return $ Right seqSourceRow

-- Global SSF consistency checks

warnSeqSourceConsistency :: FilePath -> SeqSourceRows -> PoseidonIO ()
warnSeqSourceConsistency seqSourcePath xs = do
Expand Down
4 changes: 2 additions & 2 deletions src/Poseidon/Utils.hs
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ data PoseidonException =
| PoseidonGenotypeException String -- ^ An exception to represent errors in the genotype data
| PoseidonGenotypeExceptionForward SomeException -- ^ An exception to represent errors in the genotype data forwarded from the sequence-formats library
| PoseidonHttpExceptionForward HttpException -- ^ An exception to represent errors in the remote data loading forwarded from simpleHttp
| PoseidonFileRowException FilePath Int String -- ^ An exception to represent errors when trying to parse the janno or seqSource file
| PoseidonFileRowException FilePath String String -- ^ An exception to represent errors when trying to parse the janno or seqSource file
| PoseidonFileConsistencyException FilePath String -- ^ An exception to represent consistency errors in janno or seqSource files
| PoseidonCrossFileConsistencyException String String -- ^ An exception to represent inconsistencies across multiple files in a package
| PoseidonCollectionException String -- ^ An exception to represent logical issues in a poseidon package Collection
Expand Down Expand Up @@ -204,7 +204,7 @@ renderPoseidonException (PoseidonHttpExceptionForward (HttpExceptionRequest _ co
renderPoseidonException (PoseidonHttpExceptionForward e) =
"Issues in HTTP-communication with server: " ++ show e
renderPoseidonException (PoseidonFileRowException f i s) =
"Can't read sample in " ++ f ++ " in line " ++ show i ++ ": " ++ s
"Can't read sample in " ++ f ++ " in line " ++ i ++ ": " ++ s
renderPoseidonException (PoseidonFileConsistencyException f s) =
"Consistency issues in file " ++ f ++ ": " ++ s
renderPoseidonException (PoseidonCrossFileConsistencyException p s) =
Expand Down

0 comments on commit e7e1996

Please sign in to comment.