Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a feature to remove empty .janno columns with rectify #326

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions src-executables/Main-trident.hs
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ rectifyOptParser = RectifyOptions <$> parseBasePaths
<*> parseMaybePackageVersionUpdate
<*> parseChecksumsToRectify
<*> parseMaybeContributors
<*> parseJannoRemoveEmptyCols
<*> parseOnlyLatest

validateOptParser :: OP.Parser ValidateOptions
Expand Down
6 changes: 6 additions & 0 deletions src/Poseidon/CLI/OptparseApplicativeParsers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,12 @@
Left p -> Left (showParsecErr p)
Right x -> Right x

-- | Command line switch @--jannoRemoveEmpty@.
--
-- Being an 'OP.switch', the parser yields 'False' when the flag is absent and
-- 'True' when it is given. The help text reminds the user that the .janno
-- checksum has to be updated separately via @--checksumJanno@.
parseJannoRemoveEmptyCols :: OP.Parser Bool
parseJannoRemoveEmptyCols = OP.switch (
    OP.long "jannoRemoveEmpty" <>
    OP.help "Reorder the .janno file and remove empty columns. \
            \Remember to pair this option with --checksumJanno to also update the checksum."
    )

parseMaybeLog :: OP.Parser (Maybe String)
parseMaybeLog = OP.option (Just <$> OP.str) (
Expand Down
20 changes: 17 additions & 3 deletions src/Poseidon/CLI/Rectify.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,19 @@
renderNameWithVersion)
import Poseidon.GenotypeData (GenotypeDataSpec (..),
GenotypeFileSpec (..))
import Poseidon.Janno (writeJannoFileWithoutEmptyCols)
import Poseidon.Package (PackageReadOptions (..),
PoseidonPackage (..),
defaultPackageReadOptions,
readPoseidonPackageCollection,
writePoseidonPackage)
import Poseidon.Utils (PoseidonIO, getChecksum, logDebug,
logInfo)
logInfo, logWarning)
import Poseidon.Version (VersionComponent (..),
updateThreeComponentVersion)

import Control.DeepSeq ((<$!!>))
import Control.Monad (when)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.List (nub)
import Data.Maybe (fromJust)
Expand All @@ -36,6 +38,7 @@
, _rectifyPackageVersionUpdate :: Maybe PackageVersionUpdate
, _rectifyChecksums :: ChecksumsToRectify
, _rectifyNewContributors :: Maybe [ContributorSpec]
, _rectifyJannoRemoveEmptyCols :: Bool

Check warning on line 41 in src/Poseidon/CLI/Rectify.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/Rectify.hs#L41

Added line #L41 was not covered by tests
, _rectifyOnlyLatest :: Bool
}

Expand All @@ -55,7 +58,12 @@
}

runRectify :: RectifyOptions -> PoseidonIO ()
runRectify (RectifyOptions baseDirs ignorePosVer newPosVer pacVerUpdate checksumUpdate newContributors onlyLatest) = do
runRectify (RectifyOptions
baseDirs
ignorePosVer newPosVer pacVerUpdate checksumUpdate newContributors
jannoRemoveEmptyCols
onlyLatest
) = do
let pacReadOpts = defaultPackageReadOptions {
_readOptIgnoreChecksums = True
, _readOptIgnoreGeno = True
Expand All @@ -72,6 +80,13 @@
rectifyOnePackage :: PoseidonPackage -> PoseidonIO ()
rectifyOnePackage inPac = do
logInfo $ "Rectifying package: " ++ renderNameWithVersion inPac
when jannoRemoveEmptyCols $ do
case posPacJannoFile inPac of
Nothing -> do
logWarning "No .janno file to modify with --jannoRemoveEmpty"

Check warning on line 86 in src/Poseidon/CLI/Rectify.hs

View check run for this annotation

Codecov / codecov/patch

src/Poseidon/CLI/Rectify.hs#L85-L86

Added lines #L85 - L86 were not covered by tests
Just jannoPath -> do
logInfo "Reordering and removing empty columns from .janno file"
liftIO $ writeJannoFileWithoutEmptyCols (posPacBaseDir inPac </> jannoPath) (posPacJanno inPac)
updatedPacPosVer <- updatePoseidonVersion newPosVer inPac
updatedPacContri <- addContributors newContributors updatedPacPosVer
updatedPacChecksums <- updateChecksums checksumUpdate updatedPacContri
Expand Down Expand Up @@ -156,7 +171,6 @@
if e then Just <$!!> getChk file else return defaultChkSum



completeAndWritePackage :: Maybe PackageVersionUpdate -> PoseidonPackage -> PoseidonIO ()
completeAndWritePackage Nothing pac = do
logDebug "Writing rectified POSEIDON.yml file"
Expand Down
2 changes: 1 addition & 1 deletion src/Poseidon/CLI/Survey.hs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ renderJannoCompleteness (JannoRows rows) =
in init ratioString -- remove last entry covering the additional columns (CsvNamedRecord)
where
-- the following magic was heavily inspired by https://stackoverflow.com/a/41524511/3216883
getRatiosForEachField :: (Generics.SOP.Generic a, Code a ~ '[ xs ], All PresenceCountable xs) => [a] -> [Ratio Int]
getRatiosForEachField :: (Generics.SOP.Generic a, Code a ~ '[ xs ], All PresenceCountable xs) => [a] -> [Ratio Int] --'
getRatiosForEachField =
hcollapse
. hcmap (Proxy :: Proxy PresenceCountable) (K . measureFillState)
Expand Down
46 changes: 25 additions & 21 deletions src/Poseidon/Janno.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
JannoRelationDegree (..),
JannoLibraryBuilt (..),
writeJannoFile,
writeJannoFileWithoutEmptyCols,
readJannoFile,
createMinimalJanno,
createMinimalSample,
Expand Down Expand Up @@ -64,7 +65,7 @@
import qualified Data.HashMap.Strict as HM
import Data.List (elemIndex, foldl',
intercalate, nub, sort,
(\\))
transpose, (\\))
import Data.Maybe (fromJust)
import qualified Data.Text as T
import qualified Data.Vector as V
Expand Down Expand Up @@ -282,8 +283,11 @@
transNA "n/a" = Nothing
transNA x = Just x

-- | Make missing values explicit: every field of the named record whose
-- value is an empty bytestring is replaced by the literal @n/a@; all other
-- fields are left untouched.
explicitNA :: Csv.NamedRecord -> Csv.NamedRecord
explicitNA = HM.map fillEmpty
  where
    fillEmpty value
        | Bchs.null value = "n/a"
        | otherwise       = value

instance Csv.ToNamedRecord JannoRow where
toNamedRecord j = Csv.namedRecord [
toNamedRecord j = explicitNA $ Csv.namedRecord [
"Poseidon_ID" Csv..= jPoseidonID j
, "Genetic_Sex" Csv..= jGeneticSex j
, "Group_Name" Csv..= jGroupName j
Expand Down Expand Up @@ -394,15 +398,27 @@

-- Janno file writing

-- | Build the CSV header for a list of janno rows: the fixed 'jannoHeader'
-- columns come first, followed by the alphabetically sorted names of all
-- additional columns found in any of the rows.
makeHeaderWithAdditionalColumns :: [JannoRow] -> Csv.Header
makeHeaderWithAdditionalColumns rows = V.fromList (jannoHeader ++ additionalColumnNames)
  where
    -- one key/value map of additional columns per row
    additionalColumnMaps  = [getCsvNR (jAdditionalColumns row) | row <- rows]
    -- the union collapses duplicates, so every extra column is listed once
    additionalColumnNames = sort . HM.keys $ HM.unions additionalColumnMaps

-- | Encode the given janno rows by name and write the result to @path@.
--
-- The header is derived from the rows themselves with
-- 'makeHeaderWithAdditionalColumns'. Empty values are not post-processed
-- here: the 'Csv.ToNamedRecord' instance of 'JannoRow' already passes every
-- record through 'explicitNA', so the encoded bytestring is written as is.
writeJannoFile :: FilePath -> JannoRows -> IO ()
writeJannoFile path (JannoRows rows) = do
    let jannoAsBytestring = Csv.encodeByNameWith encodingOptions (makeHeaderWithAdditionalColumns rows) rows
    Bch.writeFile path jannoAsBytestring

-- | Write the given janno rows to @path@ as tab-separated text, leaving out
-- every column whose cells below the first row are all @n/a@.
--
-- The rows are encoded exactly as in 'writeJannoFile', decoded again into a
-- plain matrix of cells (without header interpretation), filtered column by
-- column and finally joined with tabs and newlines.
--
-- NOTE(review): the filtered cells are joined with raw tabs/newlines, so any
-- quoting the encoder applied to a cell is not re-applied here -- confirm that
-- janno values can never contain tabs, quotes or line breaks.
writeJannoFileWithoutEmptyCols :: FilePath -> JannoRows -> IO ()
writeJannoFileWithoutEmptyCols path (JannoRows rows) = do
    let jannoAsBytestring = Csv.encodeByNameWith encodingOptions (makeHeaderWithAdditionalColumns rows) rows
    case Csv.decodeWith decodingOptions Csv.NoHeader jannoAsBytestring :: Either String (V.Vector (V.Vector Bch.ByteString)) of
        -- Decoding what was encoded just above is not expected to fail; if it
        -- does, keep the parser message so that the report is actionable.
        Left err -> error $ "internal error, please report (writeJannoFileWithoutEmptyCols: " ++ err ++ ")"
        Right x  -> do
            let janno                   = V.toList $ V.map V.toList x
                -- work on columns instead of rows
                jannoTransposed         = transpose janno
                -- the first cell of each column (presumably the column name) is
                -- skipped; 'drop 1' is the total equivalent of 'tail'
                jannoTransposedFiltered = filter (any (/= "n/a") . drop 1) jannoTransposed
                jannoBackTransposed     = transpose jannoTransposedFiltered
                jannoConcat             = Bch.intercalate "\n" $ map (Bch.intercalate "\t") jannoBackTransposed
            Bch.writeFile path (jannoConcat <> "\n")

encodingOptions :: Csv.EncodeOptions
encodingOptions = Csv.defaultEncodeOptions {
Expand Down Expand Up @@ -528,18 +544,6 @@
"broken value: " ++ actual ++ ", " ++
"problematic characters: " ++ show leftover ++ ")"

-- | Helper function that replaces empty cells in the bytestring
-- representation of a janno file with an explicit "n/a".
explicitNA :: Bch.ByteString -> Bch.ByteString
explicitNA tsv = replaceInJannoBytestring Bch.empty "n/a" tsv

-- | Replace cells in a tab-separated bytestring.
--
-- The input is split into lines and each line into cells at every tab
-- character. A cell that equals @from@, or @from@ directly followed by a
-- carriage return, is replaced by @to@ (the carriage return is not kept).
-- The lines are put back together with 'Bch.unlines', so the result always
-- ends with a newline after the last line.
replaceInJannoBytestring :: Bch.ByteString -> Bch.ByteString -> Bch.ByteString -> Bch.ByteString
replaceInJannoBytestring from to tsv =
    Bch.unlines
        [ Bch.intercalate (Bch.pack "\t") (map substitute cells)
        | cells <- map (Bch.splitWith (== '\t')) (Bch.lines tsv)
        ]
  where
    -- variant of the search value as it appears at the end of a CRLF line
    fromWithCR = Bch.snoc from '\r'
    substitute cell
        | cell == from || cell == fromWithCR = to
        | otherwise                          = cell

-- Global janno consistency checks

checkJannoConsistency :: FilePath -> JannoRows -> Either PoseidonException JannoRows
Expand Down
5 changes: 2 additions & 3 deletions src/Poseidon/SequencingSource.hs
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ instance Csv.FromNamedRecord SeqSourceRow where
<*> pure (CsvNamedRecord (m `HM.difference` seqSourceRefHashMap))

instance Csv.ToNamedRecord SeqSourceRow where
toNamedRecord s = Csv.namedRecord [
toNamedRecord s = explicitNA $ Csv.namedRecord [
"poseidon_IDs" Csv..= sPoseidonID s
, "udg" Csv..= sUDG s
, "library_built" Csv..= sLibraryBuilt s
Expand Down Expand Up @@ -385,8 +385,7 @@ instance Csv.ToNamedRecord SeqSourceRow where
writeSeqSourceFile :: FilePath -> SeqSourceRows -> IO ()
writeSeqSourceFile path (SeqSourceRows rows) = do
let seqSourceAsBytestring = Csv.encodeByNameWith encodingOptions makeHeaderWithAdditionalColumns rows
let seqSourceAsBytestringwithNA = explicitNA seqSourceAsBytestring
Bch.writeFile path seqSourceAsBytestringwithNA
Bch.writeFile path seqSourceAsBytestring
where
makeHeaderWithAdditionalColumns :: Csv.Header
makeHeaderWithAdditionalColumns =
Expand Down
15 changes: 9 additions & 6 deletions test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ d41d8cd98f00b204e9800998ecf8427e validate validate/validate8
d41d8cd98f00b204e9800998ecf8427e validate validate/validate9
bc636b9c03ea9359acd254a9911e5af3 list list/list1
b18847f5498ae55882689b75916fdf64 list list/list2
63ef5f277f6f29163192382234211224 list list/list3
1c1f24de305405ece44393d378c0e15a list list/list4
d6eafec9087c88ed6a95e4a22f1f306c list list/list3
27c6118de70743426f8d8157ddd0f1c7 list list/list4
bc636b9c03ea9359acd254a9911e5af3 list list/list5
ad5590b0ad65e64d6b2c8d874571c9f8 list list/list6
b69c17ad4893e4e9bdcb767a229eaccb list list/list6
b197fb8dd883c7469a4791e4a677f1c0 summarise summarise/summarise1
d9e4b3f15d4e129a365d2064198d95b6 summarise summarise/summarise2
a1186fdad9ed555dff4dd61dc9838645 survey survey/survey1
Expand Down Expand Up @@ -55,6 +55,9 @@ da981f8d52f60ec4d96865b224648c92 rectify init/Schiffels/POSEIDON.yml
3bb396e099d5b8771a3409f5fe85d70b rectify init/Schiffels/CHANGELOG.md
dc322649188ce2995cea8a46a7f97f3e rectify init/Schiffels/POSEIDON.yml
3bb396e099d5b8771a3409f5fe85d70b rectify init/Schiffels/CHANGELOG.md
a61f78b4e9b7e3c7e00ec7bc6aaab95b rectify init/Schiffels/POSEIDON.yml
3bb396e099d5b8771a3409f5fe85d70b rectify init/Schiffels/CHANGELOG.md
083fe7ef4206c979356a3a2454d780b1 rectify init/Schiffels/Schiffels.janno
2757f727e02dd6453fffe68c4c6ec4c8 forge forge/ForgePac1/POSEIDON.yml
1286a2580e4bfbed7d804d5f3fe125f7 forge forge/ForgePac1/ForgePac1.geno
8846333d9a1de6510f25a3816cc70fef forge forge/ForgePac1/ForgePac1.janno
Expand Down Expand Up @@ -125,7 +128,7 @@ e375863bca9e4a91c9855396abde31c7 forge forge/ForgePac20/ForgePac20.janno
d4a05cfef045648238a94a9d621cf667 chronicle chronicle/chronicle1.yml
b43da4d5734371c0648553120f812466 timetravel timetravel/Lamnidis_2018-1.0.0/POSEIDON.yml
8d57ce1a1ab28c0d8a5f391dd790a59c timetravel timetravel/Lamnidis_2018-1.0.1/POSEIDON.yml
dc322649188ce2995cea8a46a7f97f3e timetravel timetravel/Schiffels-1.1.1/POSEIDON.yml
a61f78b4e9b7e3c7e00ec7bc6aaab95b timetravel timetravel/Schiffels-1.1.1/POSEIDON.yml
1ab24c45ef3a13e0fb34afac7a21dca8 timetravel timetravel/Schmid_2028-1.0.0/POSEIDON.yml
8d57ce1a1ab28c0d8a5f391dd790a59c fetch fetch/by_package/Lamnidis_2018-1.0.1/POSEIDON.yml
1ab24c45ef3a13e0fb34afac7a21dca8 fetch fetch/by_package/Schmid_2028-1.0.0/POSEIDON.yml
Expand All @@ -141,9 +144,9 @@ b43da4d5734371c0648553120f812466 fetch fetch/multi_packages_2/Lamnidis_2018-1.0.
8d57ce1a1ab28c0d8a5f391dd790a59c fetch fetch/multi_packages_2/Lamnidis_2018-1.0.1/POSEIDON.yml
1d2a588b88e6d1017147c01f19d0b878 listRemote listRemote/listRemote1
0ddad9ea097bca0253e0c3c6157efa68 listRemote listRemote/listRemote2
b2286cf9af7c6c8757b8109a1f58e2d9 listRemote listRemote/listRemote3
705ecf31acfb9f21bfdc5bf4e77c10cd listRemote listRemote/listRemote3
0433b2a80ee5a2eb5bf8c6404130e562 listRemote listRemote/listRemote4
8a13e5b31acabca6839100f411c38453 listRemote listRemote/listRemote5
06eb810bcba832e75d72f10800ea7774 listRemote listRemote/listRemote5
eb610918796e03da3b0035655e9f8faa jannocoalesce jannocoalesce/target1.janno
df34d0542c0a94cf9556619bff2e301d jannocoalesce jannocoalesce/target2.janno
cc76b2bf0ad6637ea6502fdefcca3508 jannocoalesce jannocoalesce/target3.janno
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ genotypeData:
indFile: ind.txt
snpSet: Other
jannoFile: Schiffels.janno
jannoFileChkSum: fd632717ecaf337a39cfd7a828a54e99
jannoFileChkSum: 083fe7ef4206c979356a3a2454d780b1
bibFile: Schiffels.bib
bibFileChkSum: 9edc4a757f785a8ecb59c54d16c5690a
changelogFile: CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords
XXX001 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX002 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX003 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX004 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX005 M POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX006 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX007 M POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX008 F POP3 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX009 F POP1 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
XXX010 M POP3 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a
Poseidon_ID Genetic_Sex Group_Name
XXX001 M POP1
XXX002 F POP2
XXX003 M POP1
XXX004 F POP2
XXX005 M POP2
XXX006 F POP2
XXX007 M POP1
XXX008 F POP3
XXX009 F POP1
XXX010 M POP3
14 changes: 7 additions & 7 deletions test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
title: Chronicle title
description: Chronicle description
chronicleVersion: 0.2.0
lastModified: 2024-11-13
lastModified: 2025-01-09
packages:
- title: Lamnidis_2018
version: 1.0.0
commit: c59bfb82fec3f2742cc0e10ceb2932ee06e56aa1
commit: 4262db441fc73a3a0fabe7165ba13261f0a994f2
path: Lamnidis_2018
- title: Lamnidis_2018
version: 1.0.1
commit: c59bfb82fec3f2742cc0e10ceb2932ee06e56aa1
commit: 4262db441fc73a3a0fabe7165ba13261f0a994f2
path: Lamnidis_2018_newVersion
- title: Schiffels
version: 1.1.1
commit: a32a46cf82b8895af72c8920be4ca4843cd5e7f7
commit: 88e86f51139053f19d55f453b9fbcaae205d37a3
path: Schiffels
- title: Schiffels_2016
version: 1.0.1
commit: c59bfb82fec3f2742cc0e10ceb2932ee06e56aa1
commit: 4262db441fc73a3a0fabe7165ba13261f0a994f2
path: Schiffels_2016
- title: Schmid_2028
version: 1.0.0
commit: c59bfb82fec3f2742cc0e10ceb2932ee06e56aa1
commit: 4262db441fc73a3a0fabe7165ba13261f0a994f2
path: Schmid_2028
- title: Wang_2020
version: 0.1.0
commit: c59bfb82fec3f2742cc0e10ceb2932ee06e56aa1
commit: 4262db441fc73a3a0fabe7165ba13261f0a994f2
path: Wang_2020
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ genotypeData:
indFile: ind.txt
snpSet: Other
jannoFile: Schiffels.janno
jannoFileChkSum: fd632717ecaf337a39cfd7a828a54e99
jannoFileChkSum: 083fe7ef4206c979356a3a2454d780b1
bibFile: Schiffels.bib
bibFileChkSum: 9edc4a757f785a8ecb59c54d16c5690a
changelogFile: CHANGELOG.md
Loading
Loading