From 05e935f6793c9180ef9ea97a9c36ed9886d8fc19 Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Sat, 2 May 2020 22:18:49 +0200 Subject: [PATCH] Added option --char to goalign clean sites command #5 --- align/align.go | 95 +++++++++++++++++++++++++++++++++++++----- align/align_test.go | 34 +++++++++++++++ cmd/cleansites.go | 50 ++++++++++++++++------ docs/commands/clean.md | 19 +++++++-- test.sh | 59 ++++++++++++++++++++++++++ 5 files changed, 232 insertions(+), 25 deletions(-) diff --git a/align/align.go b/align/align.go index c5f4b01..81a7497 100644 --- a/align/align.go +++ b/align/align.go @@ -68,8 +68,10 @@ type Alignment interface { Recombine(rate float64, lenprop float64) // converts coordinates on the given sequence to coordinates on the alignment RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error) - RemoveGapSeqs(cutoff float64) // Removes sequences having >= cutoff gaps - RemoveGapSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment + RemoveGapSeqs(cutoff float64) // Removes sequences having >= cutoff gaps + RemoveGapSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment + RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment + RemoveMajorityCharacterSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment // Replaces match characters (.) by their corresponding characters on the first sequence ReplaceMatchChars() Sample(nb int) (Alignment, error) // generate a sub sample of the sequences @@ -241,7 +243,7 @@ func (a *align) ShuffleSites(rate float64, roguerate float64, randroguefirst boo return rogues } -// Removes positions constituted of [cutoff*100%,100%] Gaps +// RemoveGapSites Removes positions constituted of [cutoff*100%,100%] Gaps // Exception fo a cutoff of 0: does not remove positions with 0% gaps // Cutoff must be between 0 and 1, otherwise set to 0. // 0 means that positions with > 0 gaps will be removed @@ -254,24 +256,97 @@ func (a *align) ShuffleSites(rate float64, roguerate float64, randroguefirst boo // // Returns the number of consecutive removed sites at start and end of alignment func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int) { - var nbgaps int + return a.RemoveCharacterSites(GAP, cutoff, ends) +} + +// RemoveCharacterSites Removes positions constituted of [cutoff*100%,100%] of the given character +// Exception fo a cutoff of 0: does not remove positions with 0% of this character +// Cutoff must be between 0 and 1, otherwise set to 0. +// 0 means that positions with > 0 of the given character will be removed +// other cutoffs : ]0,1] mean that positions with >= cutoff of this character will be removed +// +// If ends is true: then only removes consecutive positions that match the cutoff +// from start or from end of alignment. +// Example with a cutoff of 0.3 and ends and with the given proportion of this character: +// 0.4 0.5 0.1 0.5 0.6 0.1 0.8 will remove positions 0,1 and 6 +// +// Returns the number of consecutive removed sites at start and end of alignment +func (a *align) RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) { + var nbchars int if cutoff < 0 || cutoff > 1 { cutoff = 0 } toremove := make([]int, 0, 10) - // To remove only gap positions at start and ends positions + // To remove only positions with this character at start and ends positions firstcontinuous := -1 lastcontinuous := a.Length() for site := 0; site < a.Length(); site++ { - nbgaps = 0 + nbchars = 0 + for seq := 0; seq < a.NbSequences(); seq++ { - if a.seqs[seq].sequence[site] == GAP { - nbgaps++ + if a.seqs[seq].sequence[site] == c { + nbchars++ + } + } + if (cutoff > 0.0 && float64(nbchars) >= cutoff*float64(a.NbSequences())) || (cutoff == 0 && nbchars > 0) { + toremove = append(toremove, site) + if site == firstcontinuous+1 { + firstcontinuous++ + } + } + } + + first = firstcontinuous + 1 + + /* Now we remove positions, starting at the end */ + sort.Ints(toremove) + nbremoved := 0 + for i := (len(toremove) - 1); i >= 0; i-- { + if toremove[i] == lastcontinuous-1 { + lastcontinuous-- + } + + if !ends || lastcontinuous == toremove[i] || toremove[i] <= firstcontinuous { + nbremoved++ + for seq := 0; seq < a.NbSequences(); seq++ { + a.seqs[seq].sequence = append(a.seqs[seq].sequence[:toremove[i]], a.seqs[seq].sequence[toremove[i]+1:]...) } } - if (cutoff > 0.0 && float64(nbgaps) >= cutoff*float64(a.NbSequences())) || (cutoff == 0 && nbgaps > 0) { + } + last = a.Length() - lastcontinuous + a.length -= nbremoved + + return first, last +} + +// RemoveMajorityCharacterSites Removes positions constituted of [cutoff*100%,100%] of the most +// abundant character in these sites. +// Exception fo a cutoff of 0: does not remove positions with 0% of the most abundant character +// Cutoff must be between 0 and 1, otherwise set to 0. +// 0 means that positions with > 0 of the given character will be removed +// other cutoffs : ]0,1] mean that positions with >= cutoff of the most abundant character will be removed +// +// If ends is true: then only removes consecutive positions that match the cutoff +// from start or from end of alignment. +// Example with a cutoff of 0.3 and ends and with the given proportion of this character: +// 0.4 0.5 0.1 0.5 0.6 0.1 0.8 will remove positions 0,1 and 6 +// +// Returns the number of consecutive removed sites at start and end of alignment +func (a *align) RemoveMajorityCharacterSites(cutoff float64, ends bool) (first, last int) { + _, occur := a.MaxCharStats(false) + + length := a.Length() + nbsesq := a.NbSequences() + + toremove := make([]int, 0, 10) + // To remove only positions with this character at start and ends positions + firstcontinuous := -1 + lastcontinuous := a.Length() + + for site := 0; site < length; site++ { + if (cutoff > 0.0 && float64(occur[site]) >= cutoff*float64(nbsesq)) || (cutoff == 0 && occur[site] > 0) { toremove = append(toremove, site) if site == firstcontinuous+1 { firstcontinuous++ @@ -281,7 +356,7 @@ func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int) { first = firstcontinuous + 1 - /* Now we remove gap positions, starting at the end */ + /* Now we remove positions, starting at the end */ sort.Ints(toremove) nbremoved := 0 for i := (len(toremove) - 1); i >= 0; i-- { diff --git a/align/align_test.go b/align/align_test.go index 8620316..0679a4a 100644 --- a/align/align_test.go +++ b/align/align_test.go @@ -1885,3 +1885,37 @@ func Test_align_NumMutationsComparedToReferenceSequence(t *testing.T) { } } } + +func Test_align_RemoveMajorityCharacterSites(t *testing.T) { + var a, a2, exp, exp2 Alignment + + a = NewAlign(NUCLEOTIDS) + a.AddSequence("A", "N-ANGA-GACC", "") + a.AddSequence("B", "N-TN-T-TTTC", "") + a.AddSequence("C", "NCTN-TTT--T", "") + a.AddSequence("D", "C-ANCCCCCCC", "") + + a2, _ = a.Clone() + + exp = NewAlign(NUCLEOTIDS) + exp.AddSequence("A", "AGA-GAC", "") + exp.AddSequence("B", "T-T-TTT", "") + exp.AddSequence("C", "T-TTT--", "") + exp.AddSequence("D", "ACCCCCC", "") + + exp2 = NewAlign(NUCLEOTIDS) + exp2.AddSequence("A", "ANGA-GAC", "") + exp2.AddSequence("B", "TN-T-TTT", "") + exp2.AddSequence("C", "TN-TTT--", "") + exp2.AddSequence("D", "ANCCCCCC", "") + + a.RemoveMajorityCharacterSites(0.6, false) + a2.RemoveMajorityCharacterSites(0.6, true) + + if !a.Identical(exp) { + t.Error(fmt.Errorf("Remove majority failed")) + } + if !a2.Identical(exp2) { + t.Error(fmt.Errorf("Remove majority failed with ends")) + } +} diff --git a/cmd/cleansites.go b/cmd/cleansites.go index 241e318..46216f3 100644 --- a/cmd/cleansites.go +++ b/cmd/cleansites.go @@ -10,23 +10,28 @@ import ( ) var cleanEnds bool +var cleanChar string // cleansitesCmd represents the cleansites command var cleansitesCmd = &cobra.Command{ Use: "sites", - Short: "Removes sites with gaps", - Long: `Removes sites constituted of gaps + Short: "Removes sites with specific characters", + Long: `Removes sites constituted of specific characters -Removes sites constitued of >= cutoff gap sites. +Removes sites constitued of >= cutoff specific characters. This characters can be : -Exception for a cutoff of 0: removes sites constitued of > 0 gap sites. +1. Gap (--char=GAP or --char=-, default) +2. Any other character X specified by --char=X (case sensitive) +3. The most abundant character in the site --char=MAJ (including gaps) + +Exception for a cutoff of 0: removes sites constitued of > 0 specified character (with --char=MAJ, then will remove all columns). Examples: -- With a cutoff of 0.5: a site with 5 gaps over 10 sequences will be removed; -- With a cutoff of 0.5: a site with 4 gaps over 10 sequences will not be removed; -- With a cutoff of 0.0 a site with 1 gap over 10 sequences will be removed. +- With a cutoff of 0.5: a site with 5 specified characters over 10 sequences will be removed; +- With a cutoff of 0.5: a site with 4 specified characters over 10 sequences will not be removed; +- With a cutoff of 0.0 a site with 1 specified over 10 sequences will be removed. -If cutoff is <0 or >1, it will be considered as 0, which means that every site with at least 1 gap +If cutoff is <0 or >1, it will be considered as 0, which means that every site with at least 1 specified character will be removed.`, RunE: func(cmd *cobra.Command, args []string) (err error) { var aligns *align.AlignChannel @@ -44,17 +49,36 @@ will be removed.`, defer closeWriteFile(f, cleanOutput) i := 0 + char := "" + for al := range aligns.Achan { beforelength := al.Length() - nbstart, nbend = al.RemoveGapSites(cleanCutoff, cleanEnds) + + if cleanChar == string(align.GAP) || cleanChar == "GAP" { + char = "gaps" + nbstart, nbend = al.RemoveGapSites(cleanCutoff, cleanEnds) + } else if cleanChar == "MAJ" { + char = "maj" + nbstart, nbend = al.RemoveMajorityCharacterSites(cleanCutoff, cleanEnds) + } else { + //single character + c := []rune(cleanChar) + if len(c) != 1 { + err = fmt.Errorf("--char should be a single character") + io.LogError(err) + return + } + char = string(c[0]) + nbstart, nbend = al.RemoveCharacterSites(c[0], cleanCutoff, cleanEnds) + } afterlength := al.Length() writeAlign(al, f) if !cleanQuiet { io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) length before cleaning=%d", i, beforelength)) io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) length after cleaning=%d", i, afterlength)) - io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of gaps=%d", i, beforelength-afterlength)) - io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of start gaps=%d", i, nbstart)) - io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of end gaps=%d", i, nbend)) + io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of %s=%d", i, char, beforelength-afterlength)) + io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of start %s=%d", i, char, nbstart)) + io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of end %s=%d", i, char, nbend)) } } @@ -68,5 +92,7 @@ will be removed.`, func init() { cleansitesCmd.PersistentFlags().BoolVar(&cleanEnds, "ends", false, "If true, then only remove consecutive gap positions from alignment start and end") + cleansitesCmd.PersistentFlags().StringVar(&cleanChar, "char", "GAP", "The character the cutoff is applied to. May be GAP, MAJ, or any other character") + cleanCmd.AddCommand(cleansitesCmd) } diff --git a/docs/commands/clean.md b/docs/commands/clean.md index 1e269eb..e403b8e 100644 --- a/docs/commands/clean.md +++ b/docs/commands/clean.md @@ -3,12 +3,25 @@ ## Commands ### clean -This command removes alignment sites or sequences constitued of >= than a given proportion of gaps. Exception for a cutoff of 0: removes sites /sequences constitued of > 0 gaps. +This command removes alignment sites or sequences constitued of >= than a given proportion of gaps or other characters. Exception for a cutoff of 0: removes sites /sequences constitued of > 0 of this character. Two subcommands: -* `goalign clean sites`: Removes sites with gaps -* `goalign clean seqs`: Removes sequences with gaps +* `goalign clean sites`: Removes sites constitued of >= cutoff specific characters. This characters can be : + 1. Gap (--char=GAP or --char=-, default) + 2. Any other character X specified by --char=X (case sensitive) + 3. The most abundant character in the site --char=MAJ (including gaps) + + Exception for a cutoff of 0: removes sites constitued of > 0 specified character (with --char=MAJ, then will remove all columns). + + Examples: + - With a cutoff of 0.5: a site with 5 specified characters over 10 sequences will be removed; + - With a cutoff of 0.5: a site with 4 specified characters over 10 sequences will not be removed; + - With a cutoff of 0.0 a site with 1 specified over 10 sequences will be removed. + + If cutoff is <0 or >1, it will be considered as 0, which means that every site with at least 1 specified character + will be removed.`, +* `goalign clean seqs`: Removes sequences with gaps. So far only gap filter is implemented. Examples with sites: - With a cutoff of 0.5: a site with 5 gaps over 10 sequences will be removed; diff --git a/test.sh b/test.sh index 9ae9c58..a93d075 100755 --- a/test.sh +++ b/test.sh @@ -139,6 +139,65 @@ diff -q -b result expected diff -q -b log expectedlog rm -f expected result log expectedlog +echo "->goalign clean sites --ends --char MAJ" +cat > input <A +N-ANGA-GACC +>B +N-TN-T-TTTC +>C +NCTN-TTT--T +>D +C-ANCCCCCCC +EOF + +cat > expected <A +AGA-GAC +>B +T-T-TTT +>C +T-TTT-- +>D +ACCCCCC +EOF + +cat > expected2 <A +ANGA-GAC +>B +TN-T-TTT +>C +TN-TTT-- +>D +ANCCCCCC +EOF + +cat > expectedlog < expectedlog2 < result 2>log +diff -q -b result expected +diff -q -b log expectedlog +rm -f expected result log expectedlog + +${GOALIGN} clean sites -i input -c 0.6 --char MAJ --ends > result 2>log +diff -q -b result expected2 +diff -q -b log expectedlog2 +rm -f expected2 result log expectedlog2 echo "->goalign clean seqs" cat > expected <