Skip to content

Commit

Permalink
Added option --char to goalign clean sites command #5
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed May 2, 2020
1 parent f984bdd commit 05e935f
Show file tree
Hide file tree
Showing 5 changed files with 232 additions and 25 deletions.
95 changes: 85 additions & 10 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,10 @@ type Alignment interface {
Recombine(rate float64, lenprop float64)
// converts coordinates on the given sequence to coordinates on the alignment
RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error)
RemoveGapSeqs(cutoff float64) // Removes sequences having >= cutoff gaps
RemoveGapSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment
RemoveGapSeqs(cutoff float64) // Removes sequences having >= cutoff gaps
RemoveGapSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment
RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment
RemoveMajorityCharacterSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment
// Replaces match characters (.) by their corresponding characters on the first sequence
ReplaceMatchChars()
Sample(nb int) (Alignment, error) // generate a sub sample of the sequences
Expand Down Expand Up @@ -241,7 +243,7 @@ func (a *align) ShuffleSites(rate float64, roguerate float64, randroguefirst boo
return rogues
}

// Removes positions constituted of [cutoff*100%,100%] Gaps
// RemoveGapSites Removes positions constituted of [cutoff*100%,100%] Gaps
// Exception fo a cutoff of 0: does not remove positions with 0% gaps
// Cutoff must be between 0 and 1, otherwise set to 0.
// 0 means that positions with > 0 gaps will be removed
Expand All @@ -254,24 +256,97 @@ func (a *align) ShuffleSites(rate float64, roguerate float64, randroguefirst boo
//
// Returns the number of consecutive removed sites at start and end of alignment
func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int) {
var nbgaps int
return a.RemoveCharacterSites(GAP, cutoff, ends)
}

// RemoveCharacterSites Removes positions constituted of [cutoff*100%,100%] of the given character
// Exception fo a cutoff of 0: does not remove positions with 0% of this character
// Cutoff must be between 0 and 1, otherwise set to 0.
// 0 means that positions with > 0 of the given character will be removed
// other cutoffs : ]0,1] mean that positions with >= cutoff of this character will be removed
//
// If ends is true: then only removes consecutive positions that match the cutoff
// from start or from end of alignment.
// Example with a cutoff of 0.3 and ends and with the given proportion of this character:
// 0.4 0.5 0.1 0.5 0.6 0.1 0.8 will remove positions 0,1 and 6
//
// Returns the number of consecutive removed sites at start and end of alignment
func (a *align) RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) {
var nbchars int
if cutoff < 0 || cutoff > 1 {
cutoff = 0
}

toremove := make([]int, 0, 10)
// To remove only gap positions at start and ends positions
// To remove only positions with this character at start and ends positions
firstcontinuous := -1
lastcontinuous := a.Length()

for site := 0; site < a.Length(); site++ {
nbgaps = 0
nbchars = 0

for seq := 0; seq < a.NbSequences(); seq++ {
if a.seqs[seq].sequence[site] == GAP {
nbgaps++
if a.seqs[seq].sequence[site] == c {
nbchars++
}
}
if (cutoff > 0.0 && float64(nbchars) >= cutoff*float64(a.NbSequences())) || (cutoff == 0 && nbchars > 0) {
toremove = append(toremove, site)
if site == firstcontinuous+1 {
firstcontinuous++
}
}
}

first = firstcontinuous + 1

/* Now we remove positions, starting at the end */
sort.Ints(toremove)
nbremoved := 0
for i := (len(toremove) - 1); i >= 0; i-- {
if toremove[i] == lastcontinuous-1 {
lastcontinuous--
}

if !ends || lastcontinuous == toremove[i] || toremove[i] <= firstcontinuous {
nbremoved++
for seq := 0; seq < a.NbSequences(); seq++ {
a.seqs[seq].sequence = append(a.seqs[seq].sequence[:toremove[i]], a.seqs[seq].sequence[toremove[i]+1:]...)
}
}
if (cutoff > 0.0 && float64(nbgaps) >= cutoff*float64(a.NbSequences())) || (cutoff == 0 && nbgaps > 0) {
}
last = a.Length() - lastcontinuous
a.length -= nbremoved

return first, last
}

// RemoveMajorityCharacterSites Removes positions constituted of [cutoff*100%,100%] of the most
// abundant character in these sites.
// Exception fo a cutoff of 0: does not remove positions with 0% of the most abundant character
// Cutoff must be between 0 and 1, otherwise set to 0.
// 0 means that positions with > 0 of the given character will be removed
// other cutoffs : ]0,1] mean that positions with >= cutoff of the most abundant character will be removed
//
// If ends is true: then only removes consecutive positions that match the cutoff
// from start or from end of alignment.
// Example with a cutoff of 0.3 and ends and with the given proportion of this character:
// 0.4 0.5 0.1 0.5 0.6 0.1 0.8 will remove positions 0,1 and 6
//
// Returns the number of consecutive removed sites at start and end of alignment
func (a *align) RemoveMajorityCharacterSites(cutoff float64, ends bool) (first, last int) {
_, occur := a.MaxCharStats(false)

length := a.Length()
nbsesq := a.NbSequences()

toremove := make([]int, 0, 10)
// To remove only positions with this character at start and ends positions
firstcontinuous := -1
lastcontinuous := a.Length()

for site := 0; site < length; site++ {
if (cutoff > 0.0 && float64(occur[site]) >= cutoff*float64(nbsesq)) || (cutoff == 0 && occur[site] > 0) {
toremove = append(toremove, site)
if site == firstcontinuous+1 {
firstcontinuous++
Expand All @@ -281,7 +356,7 @@ func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int) {

first = firstcontinuous + 1

/* Now we remove gap positions, starting at the end */
/* Now we remove positions, starting at the end */
sort.Ints(toremove)
nbremoved := 0
for i := (len(toremove) - 1); i >= 0; i-- {
Expand Down
34 changes: 34 additions & 0 deletions align/align_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1885,3 +1885,37 @@ func Test_align_NumMutationsComparedToReferenceSequence(t *testing.T) {
}
}
}

func Test_align_RemoveMajorityCharacterSites(t *testing.T) {
var a, a2, exp, exp2 Alignment

a = NewAlign(NUCLEOTIDS)
a.AddSequence("A", "N-ANGA-GACC", "")
a.AddSequence("B", "N-TN-T-TTTC", "")
a.AddSequence("C", "NCTN-TTT--T", "")
a.AddSequence("D", "C-ANCCCCCCC", "")

a2, _ = a.Clone()

exp = NewAlign(NUCLEOTIDS)
exp.AddSequence("A", "AGA-GAC", "")
exp.AddSequence("B", "T-T-TTT", "")
exp.AddSequence("C", "T-TTT--", "")
exp.AddSequence("D", "ACCCCCC", "")

exp2 = NewAlign(NUCLEOTIDS)
exp2.AddSequence("A", "ANGA-GAC", "")
exp2.AddSequence("B", "TN-T-TTT", "")
exp2.AddSequence("C", "TN-TTT--", "")
exp2.AddSequence("D", "ANCCCCCC", "")

a.RemoveMajorityCharacterSites(0.6, false)
a2.RemoveMajorityCharacterSites(0.6, true)

if !a.Identical(exp) {
t.Error(fmt.Errorf("Remove majority failed"))
}
if !a2.Identical(exp2) {
t.Error(fmt.Errorf("Remove majority failed with ends"))
}
}
50 changes: 38 additions & 12 deletions cmd/cleansites.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,28 @@ import (
)

var cleanEnds bool
var cleanChar string

// cleansitesCmd represents the cleansites command
var cleansitesCmd = &cobra.Command{
Use: "sites",
Short: "Removes sites with gaps",
Long: `Removes sites constituted of gaps
Short: "Removes sites with specific characters",
Long: `Removes sites constituted of specific characters
Removes sites constitued of >= cutoff gap sites.
Removes sites constitued of >= cutoff specific characters. This characters can be :
Exception for a cutoff of 0: removes sites constitued of > 0 gap sites.
1. Gap (--char=GAP or --char=-, default)
2. Any other character X specified by --char=X (case sensitive)
3. The most abundant character in the site --char=MAJ (including gaps)
Exception for a cutoff of 0: removes sites constitued of > 0 specified character (with --char=MAJ, then will remove all columns).
Examples:
- With a cutoff of 0.5: a site with 5 gaps over 10 sequences will be removed;
- With a cutoff of 0.5: a site with 4 gaps over 10 sequences will not be removed;
- With a cutoff of 0.0 a site with 1 gap over 10 sequences will be removed.
- With a cutoff of 0.5: a site with 5 specified characters over 10 sequences will be removed;
- With a cutoff of 0.5: a site with 4 specified characters over 10 sequences will not be removed;
- With a cutoff of 0.0 a site with 1 specified over 10 sequences will be removed.
If cutoff is <0 or >1, it will be considered as 0, which means that every site with at least 1 gap
If cutoff is <0 or >1, it will be considered as 0, which means that every site with at least 1 specified character
will be removed.`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
var aligns *align.AlignChannel
Expand All @@ -44,17 +49,36 @@ will be removed.`,
defer closeWriteFile(f, cleanOutput)

i := 0
char := ""

for al := range aligns.Achan {
beforelength := al.Length()
nbstart, nbend = al.RemoveGapSites(cleanCutoff, cleanEnds)

if cleanChar == string(align.GAP) || cleanChar == "GAP" {
char = "gaps"
nbstart, nbend = al.RemoveGapSites(cleanCutoff, cleanEnds)
} else if cleanChar == "MAJ" {
char = "maj"
nbstart, nbend = al.RemoveMajorityCharacterSites(cleanCutoff, cleanEnds)
} else {
//single character
c := []rune(cleanChar)
if len(c) != 1 {
err = fmt.Errorf("--char should be a single character")
io.LogError(err)
return
}
char = string(c[0])
nbstart, nbend = al.RemoveCharacterSites(c[0], cleanCutoff, cleanEnds)
}
afterlength := al.Length()
writeAlign(al, f)
if !cleanQuiet {
io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) length before cleaning=%d", i, beforelength))
io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) length after cleaning=%d", i, afterlength))
io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of gaps=%d", i, beforelength-afterlength))
io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of start gaps=%d", i, nbstart))
io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of end gaps=%d", i, nbend))
io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of %s=%d", i, char, beforelength-afterlength))
io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of start %s=%d", i, char, nbstart))
io.PrintSimpleMessage(fmt.Sprintf("Alignment (%d) number of end %s=%d", i, char, nbend))
}
}

Expand All @@ -68,5 +92,7 @@ will be removed.`,

func init() {
cleansitesCmd.PersistentFlags().BoolVar(&cleanEnds, "ends", false, "If true, then only remove consecutive gap positions from alignment start and end")
cleansitesCmd.PersistentFlags().StringVar(&cleanChar, "char", "GAP", "The character the cutoff is applied to. May be GAP, MAJ, or any other character")

cleanCmd.AddCommand(cleansitesCmd)
}
19 changes: 16 additions & 3 deletions docs/commands/clean.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,25 @@
## Commands

### clean
This command removes alignment sites or sequences constitued of >= than a given proportion of gaps. Exception for a cutoff of 0: removes sites /sequences constitued of > 0 gaps.
This command removes alignment sites or sequences constitued of >= than a given proportion of gaps or other characters. Exception for a cutoff of 0: removes sites /sequences constitued of > 0 of this character.

Two subcommands:

* `goalign clean sites`: Removes sites with gaps
* `goalign clean seqs`: Removes sequences with gaps
* `goalign clean sites`: Removes sites constitued of >= cutoff specific characters. This characters can be :
1. Gap (--char=GAP or --char=-, default)
2. Any other character X specified by --char=X (case sensitive)
3. The most abundant character in the site --char=MAJ (including gaps)

Exception for a cutoff of 0: removes sites constitued of > 0 specified character (with --char=MAJ, then will remove all columns).

Examples:
- With a cutoff of 0.5: a site with 5 specified characters over 10 sequences will be removed;
- With a cutoff of 0.5: a site with 4 specified characters over 10 sequences will not be removed;
- With a cutoff of 0.0 a site with 1 specified over 10 sequences will be removed.

If cutoff is <0 or >1, it will be considered as 0, which means that every site with at least 1 specified character
will be removed.`,
* `goalign clean seqs`: Removes sequences with gaps. So far only gap filter is implemented.

Examples with sites:
- With a cutoff of 0.5: a site with 5 gaps over 10 sequences will be removed;
Expand Down
59 changes: 59 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,65 @@ diff -q -b result expected
diff -q -b log expectedlog
rm -f expected result log expectedlog

echo "->goalign clean sites --ends --char MAJ"
cat > input <<EOF
>A
N-ANGA-GACC
>B
N-TN-T-TTTC
>C
NCTN-TTT--T
>D
C-ANCCCCCCC
EOF

cat > expected <<EOF
>A
AGA-GAC
>B
T-T-TTT
>C
T-TTT--
>D
ACCCCCC
EOF

cat > expected2 <<EOF
>A
ANGA-GAC
>B
TN-T-TTT
>C
TN-TTT--
>D
ANCCCCCC
EOF

cat > expectedlog <<EOF
Alignment (0) length before cleaning=11
Alignment (0) length after cleaning=7
Alignment (0) number of maj=4
Alignment (0) number of start maj=2
Alignment (0) number of end maj=1
EOF

cat > expectedlog2 <<EOF
Alignment (0) length before cleaning=11
Alignment (0) length after cleaning=8
Alignment (0) number of maj=3
Alignment (0) number of start maj=2
Alignment (0) number of end maj=1
EOF

${GOALIGN} clean sites -i input -c 0.6 --char MAJ > result 2>log
diff -q -b result expected
diff -q -b log expectedlog
rm -f expected result log expectedlog

${GOALIGN} clean sites -i input -c 0.6 --char MAJ --ends > result 2>log
diff -q -b result expected2
diff -q -b log expectedlog2
rm -f expected2 result log expectedlog2

echo "->goalign clean seqs"
cat > expected <<EOF
Expand Down

0 comments on commit 05e935f

Please sign in to comment.