Skip to content

Commit

Permalink
Added --unique option to goalign mask #5
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed May 6, 2020
1 parent f70594c commit a73068a
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 5 deletions.
34 changes: 34 additions & 0 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ type Alignment interface {
Stops(startingGapsAsIncomplete bool, geneticode int) (stops []int, err error)
Length() int // Length of the alignment
Mask(start, length int) error // Masks given positions
MaskUnique() error // Masks unique mutations in the given aligment (not the gaps)
MaxCharStats(excludeGaps bool) ([]rune, []int)
Mutate(rate float64) // Adds uniform substitutions in the alignment (~sequencing errors)
NbVariableSites() int // Nb of variable sites
Expand Down Expand Up @@ -842,6 +843,39 @@ func (a *align) Mask(start, length int) (err error) {
return
}

// MaskUnique masks nucleotides that are unique in their columns (unique mutations)
// - For aa sequences: It masks with X
// - For nt sequences: It masks with N
func (a *align) MaskUnique() (err error) {
rep := '.'
if a.Alphabet() == AMINOACIDS {
rep = ALL_AMINO
} else if a.Alphabet() == NUCLEOTIDS {
rep = ALL_NUCLE
} else {
err = errors.New("Mask: Cannot mask alignment, wrong alphabet")
}

for i := 0; i < a.Length(); i++ {
occurences := make([]int, 130)
indices := make([]int, 130)

for j, s := range a.seqs {
r := s.sequence[i]
occurences[int(r)]++
indices[int(r)] = j
}

for c, num := range occurences {
if num == 1 && rune(c) != rep && rune(c) != GAP {
ind := indices[c]
a.seqs[ind].sequence[i] = rep
}
}
}
return
}

// Returns the Character with the most occurences
// for each site of the alignment
func (a *align) MaxCharStats(excludeGaps bool) (out []rune, occur []int) {
Expand Down
22 changes: 22 additions & 0 deletions align/align_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1919,3 +1919,25 @@ func Test_align_RemoveMajorityCharacterSites(t *testing.T) {
t.Error(fmt.Errorf("Remove majority failed with ends"))
}
}

func Test_align_MaskUnique(t *testing.T) {
var a, exp Alignment

a = NewAlign(NUCLEOTIDS)
a.AddSequence("A", "ACANGA-TACC", "")
a.AddSequence("B", "ACTN-T-TTTC", "")
a.AddSequence("C", "ACTN-TTT--T", "")
a.AddSequence("D", "C-ANCCCCCCC", "")

exp = NewAlign(NUCLEOTIDS)
exp.AddSequence("A", "ACANNN-TNCC", "")
exp.AddSequence("B", "ACTN-T-TNNC", "")
exp.AddSequence("C", "ACTN-TNT--N", "")
exp.AddSequence("D", "N-ANNNNNNCC", "")

a.MaskUnique()

if !a.Identical(exp) {
t.Error(fmt.Errorf("Remove majority failed"))
}
}
23 changes: 18 additions & 5 deletions cmd/mask.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
var maskout string = "stdout"
var maskstart int
var masklength int
var maskunique bool

// subseqCmd represents the subseq command
var maskCmd = &cobra.Command{
Expand All @@ -19,9 +20,10 @@ var maskCmd = &cobra.Command{
Long: `Mask a part of the input alignment (replace by N|X)
It takes an alignment and replaces positions defined by start (0-based inclusive)
and length by N (nucleotide alignment) or X (amino-acid alignment).
It takes an alignment and replaces some characters by "N" or "X".
By default, it masks positions defined by start (0-based inclusive)
and length with N (nucleotide alignment) or X (amino-acid alignment).
If the length is after the end of the alignment, will stop at the
end of the alignment.
Expand All @@ -30,6 +32,9 @@ goalign mask -p -i al.phy -s 9 -l 10
This will replace 10 positions with N|X from the 10th position.
If --unique is specified, 'goalign mask --unique' will replace characters that
are unique in their column with N or X. In this case, --length and --start are ignored.
The output format is the same than input format.
`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
Expand All @@ -50,9 +55,16 @@ The output format is the same than input format.
err = aligns.Err
io.LogError(err)
}
if err = al.Mask(maskstart, masklength); err != nil {
io.LogError(err)
return
if maskunique {
if err = al.MaskUnique(); err != nil {
io.LogError(err)
return
}
} else {
if err = al.Mask(maskstart, masklength); err != nil {
io.LogError(err)
return
}
}
writeAlign(al, f)
}
Expand All @@ -67,4 +79,5 @@ func init() {
maskCmd.PersistentFlags().StringVarP(&maskout, "output", "o", "stdout", "Alignment output file")
maskCmd.PersistentFlags().IntVarP(&maskstart, "start", "s", 0, "Start position (0-based inclusive)")
maskCmd.PersistentFlags().IntVarP(&masklength, "length", "l", 10, "Length of the sub alignment")
maskCmd.PersistentFlags().BoolVar(&maskunique, "unique", false, "If given, then masks characters that are unique in their columns (start and length are ignored)")
}
27 changes: 27 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3371,6 +3371,33 @@ ${GOALIGN} mask -i input -o result -s 0 -l 2 -p
diff -q -b expected result
rm -f input expected result

echo "->goalign mask --unique"
cat > input <<EOF
>A
ACANGA-TACC
>B
ACTN-T-TTTC
>C
ACTN-TTT--T
>D
C-ANCCCCCCC
EOF

cat > expected << EOF
>A
ACANNN-TNCC
>B
ACTN-T-TNNC
>C
ACTN-TNT--N
>D
N-ANNNNNNCC
EOF

${GOALIGN} mask --unique -i input -o result
diff -q -b expected result
rm -f input expected result

echo "->goalign replace"
cat > input <<EOF
10 20
Expand Down

0 comments on commit a73068a

Please sign in to comment.