Skip to content

Commit

Permalink
added option goalign clean sites --reverse, and multi characters for …
Browse files Browse the repository at this point in the history
…--char #5
  • Loading branch information
fredericlemoine committed Apr 17, 2023
1 parent 84b0f6c commit 3cfac2a
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 49 deletions.
93 changes: 55 additions & 38 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"unicode"

"github.com/armon/go-radix"
"github.com/evolbioinfo/goalign/gutils"
"github.com/evolbioinfo/goalign/io"
)

Expand Down Expand Up @@ -109,7 +110,7 @@ type Alignment interface {
// Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment
RemoveGapSites(cutoff float64, ends bool) (first, last int, kept, removed []int)
// Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment
RemoveCharacterSites(c uint8, cutoff float64, ends bool, ignoreCase, ignoreGaps, ignoreNs bool) (first, last int, kept, removed []int)
RemoveCharacterSites(c []uint8, cutoff float64, ends bool, ignoreCase, ignoreGaps, ignoreNs, reverse bool) (first, last int, kept, removed []int)
// Removes sites having >= cutoff of the main character at these sites, returns the number of consecutive removed sites at start and end of alignment
RemoveMajorityCharacterSites(cutoff float64, ends, ignoreGaps, ignoreNs bool) (first, last int, kept, removed []int)
// Replaces match characters (.) by their corresponding characters on the first sequence
Expand Down Expand Up @@ -309,7 +310,7 @@ func (a *align) ShuffleSites(rate float64, roguerate float64, randroguefirst boo
// Returns the number of consecutive removed sites at start and end of alignment and the indexes of
// the remaining positions
func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int, kept, rm []int) {
return a.RemoveCharacterSites(GAP, cutoff, ends, false, false, false)
return a.RemoveCharacterSites([]uint8{GAP}, cutoff, ends, false, false, false, false)
}

// RemoveCharacterSites Removes positions constituted of [cutoff*100%,100%] of the given character
Expand All @@ -329,7 +330,7 @@ func (a *align) RemoveGapSites(cutoff float64, ends bool) (first, last int, kept
//
// Returns the number of consecutive removed sites at start and end of alignment and the indexes of
// the remaining positions
func (a *align) RemoveCharacterSites(c uint8, cutoff float64, ends bool, ignoreCase, ignoreGaps, ignoreNs bool) (first, last int, kept, rm []int) {
func (a *align) RemoveCharacterSites(c []uint8, cutoff float64, ends bool, ignoreCase, ignoreGaps, ignoreNs, reverse bool) (first, last int, kept, rm []int) {
var nbchars int
var total int

Expand All @@ -356,7 +357,11 @@ func (a *align) RemoveCharacterSites(c uint8, cutoff float64, ends bool, ignoreC
nbchars = 0
total = 0
for seq := 0; seq < a.NbSequences(); seq++ {
if (a.seqs[seq].sequence[site] == c) || (ignoreCase && unicode.ToLower(rune(a.seqs[seq].sequence[site])) == unicode.ToLower(rune(c))) {
selected := gutils.ContainsRune(c, a.seqs[seq].sequence[site], ignoreCase)
if reverse {
selected = !selected
}
if selected {
nbchars++
}
// If it's a gap and we ignore gaps, or if it's a N and we ignore N, then we do not count that
Expand Down Expand Up @@ -494,9 +499,9 @@ func (a *align) RemoveMajorityCharacterSites(cutoff float64, ends, ignoreGaps, i
// It returns an error if the sequence does not exist or if the coordinates are outside the ref
// sequence (<0 or > sequence length without gaps)
// Parameters:
// - name: The name of the sequence to take as reference
// - refstart: The start coordinate to convert (on the ref sequence, 0-based)
// - reflen: The length of the ref sequence to consider from refstart
// - name: The name of the sequence to take as reference
// - refstart: The start coordinate to convert (on the ref sequence, 0-based)
// - reflen: The length of the ref sequence to consider from refstart
func (a *align) RefCoordinates(name string, refstart, reflen int) (alistart, alilen int, err error) {
var exists bool
var seq []uint8
Expand Down Expand Up @@ -552,8 +557,8 @@ func (a *align) RefCoordinates(name string, refstart, reflen int) (alistart, ali
// It returns an error if the sequence does not exist or if the coordinates are outside the ref
// sequence (<0 or > sequence length without gaps)
// Parameters:
// - name: The name of the sequence to take as reference
// - sites: The positions to convert (on the ref sequence, 0-based)
// - name: The name of the sequence to take as reference
// - sites: The positions to convert (on the ref sequence, 0-based)
func (a *align) RefSites(name string, sites []int) (refsites []int, err error) {
var exists bool
var seq []uint8
Expand Down Expand Up @@ -848,7 +853,8 @@ func (a *align) Mutate(rate float64) {
// Simulate rogue taxa in the alignment:
// take the proportion prop of sequences as rogue taxa => R
// For each t in R
// * We shuffle the alignment sites of t
// - We shuffle the alignment sites of t
//
// Output: List of rogue sequence names, and List of intact sequence names
func (a *align) SimulateRogue(prop float64, proplen float64) ([]string, []string) {
var seq *seq
Expand Down Expand Up @@ -942,7 +948,8 @@ Parameters are:
from the number of sequences in the output alignment)
* counts: counts associated to each sequence (if the count of a sequence is missing, it
is considered as 0, if the count of an unkown sequence is present, it will return an error).
Sum of counts of all sequences must be > n.
Sum of counts of all sequences must be > n.
*/
func (a *align) Rarefy(nb int, counts map[string]int) (al Alignment, err error) {
var rarefySeqBag *seqbag
Expand Down Expand Up @@ -1013,9 +1020,10 @@ func (a *align) CharStatsSite(site int) (outmap map[uint8]int, err error) {
// with a given length.
// maxreplace defines the replacing character. If maskreplace is "", then, masked characters
// are replaced by "N" or "X" depending on the alphabet. Orherwise:
// 1) if maskreplace is AMBIG: just like ""
// 2) if maskreplace is MAJ: Replacing character is most frequent character of the column
// 3) if maskreplace is GAP: Replacing character is a GAP
// 1. if maskreplace is AMBIG: just like ""
// 2. if maskreplace is MAJ: Replacing character is most frequent character of the column
// 3. if maskreplace is GAP: Replacing character is a GAP
//
// if nogap is true, then Mask will not replace gaps with the replacement character
// if noref is true, then does not replace the character if it is the same as the reference sequences (only if refseq is specified).
func (a *align) Mask(refseq string, start, length int, maskreplace string, nogap, noref bool) (err error) {
Expand Down Expand Up @@ -1094,13 +1102,15 @@ func (a *align) Mask(refseq string, start, length int, maskreplace string, nogap
// MaskOccurences masks nucleotides that appear less or equal than the given number of times
// in their columns (low frenquency mutations)
// If refseq is not "" then masks unique characters if:
// 1) they are different from the given reference sequence
// 2) or if the reference is a GAP
// 1. they are different from the given reference sequence
// 2. or if the reference is a GAP
//
// maxreplace defines the replacing character. If maskreplace is "", then, masked characters
// are replaced by "N" or "X" depending on the alphabet. Orherwise:
// 1) if maskreplace is AMBIG: just like ""
// 2) if maskreplace is MAJ: Replacing character is most frequent character of the column
// 3) if maskreplace is GAP: Replacing character is a GAP
// 1. if maskreplace is AMBIG: just like ""
// 2. if maskreplace is MAJ: Replacing character is most frequent character of the column
// 3. if maskreplace is GAP: Replacing character is a GAP
//
// if nogap is true, then MaskOccurences will not replace gaps with the replacement character
func (a *align) MaskOccurences(refseq string, maxOccurence int, maskreplace string) (err error) {
var ok bool
Expand Down Expand Up @@ -1176,13 +1186,15 @@ func (a *align) MaskOccurences(refseq string, maxOccurence int, maskreplace stri

// MaskUnique masks nucleotides that are unique in their columns (unique mutations)
// If refseq is not "" then masks unique characters if:
// 1) they are different from the given reference sequence
// 2) or if the reference is a GAP
// 1. they are different from the given reference sequence
// 2. or if the reference is a GAP
//
// maxreplace defines the replacing character. If maskreplace is "", then, masked characters
// are replaced by "N" or "X" depending on the alphabet. Orherwise:
// 1) if maskreplace is AMBIG: just like ""
// 2) if maskreplace is MAJ: Replacing character is most frequent character
// 3) if maskreplace is GAP: Replacing character is a GAP
// 1. if maskreplace is AMBIG: just like ""
// 2. if maskreplace is MAJ: Replacing character is most frequent character
// 3. if maskreplace is GAP: Replacing character is a GAP
//
// if nogap is true, then MaskUnique will not replace gaps with the replacement character
func (a *align) MaskUnique(refseq string, maskreplace string) (err error) {
return a.MaskOccurences(refseq, 1, maskreplace)
Expand All @@ -1196,7 +1208,7 @@ func (a *align) MaskUnique(refseq string, maskreplace string) (err error) {
// - out: The character with the highest occurence at each site
// - occur: The number of occurences of the most common character at each site
// - total: The total number of sequences taken into account at each site (not always the number
// of sequences in the alignment if ignoreGaps or ignoreNs)
// of sequences in the alignment if ignoreGaps or ignoreNs)
func (a *align) MaxCharStats(ignoreGaps, ignoreNs bool) (out []uint8, occur []int, total []int) {
out = make([]uint8, a.Length())
occur = make([]int, a.Length())
Expand Down Expand Up @@ -1472,16 +1484,19 @@ func (a *align) Stops(startingGapsAsIncomplete bool, geneticcode int) (stops []i
return
}

/* Computes a position-specific scoring matrix (PSSM)matrix
/*
Computes a position-specific scoring matrix (PSSM)matrix
(see https://en.wikipedia.org/wiki/Position_weight_matrix)
This matrix may be in log2 scale or not (log argument)
A pseudo count may be added to values (to avoid log2(0))) with pseudocount argument
values may be normalized: normalization arg:
PSSM_NORM_NONE = 0 => No normalization
PSSM_NORM_FREQ = 1 => Normalization by frequency in the site
PSSM_NORM_DATA = 2 => Normalization by frequency in the site and divided by aa/nt frequency in data
PSSM_NORM_UNIF = 3 => Normalization by frequency in the site and divided by uniform frequency (1/4 or 1/20)
PSSM_NORM_LOGO = 4 => Normalization like "Logo"
PSSM_NORM_NONE = 0 => No normalization
PSSM_NORM_FREQ = 1 => Normalization by frequency in the site
PSSM_NORM_DATA = 2 => Normalization by frequency in the site and divided by aa/nt frequency in data
PSSM_NORM_UNIF = 3 => Normalization by frequency in the site and divided by uniform frequency (1/4 or 1/20)
PSSM_NORM_LOGO = 4 => Normalization like "Logo"
*/
func (a *align) Pssm(log bool, pseudocount float64, normalization int) (pssm map[uint8][]float64, err error) {
// Number of occurences of each different aa/nt
Expand Down Expand Up @@ -1734,7 +1749,8 @@ func (a *align) RandSubAlign(length int, consecutive bool) (Alignment, error) {

/*
Remove identical patterns/sites and return number of occurence
of each pattern (order of patterns/sites may have changed)
of each pattern (order of patterns/sites may have changed)
*/
func (a *align) Compress() (weights []int) {
var count interface{}
Expand Down Expand Up @@ -1871,10 +1887,10 @@ func (a *align) DiffWithFirst() {

// Compares all sequences to the first one and counts all differences per sequence
//
// - alldiffs: The set of all differences that have been seen at least once
// - diffs : The number of occurences of each difference, for each sequence
// Sequences are ordered as the original alignment. Differences are
// written as REFNEW, ex: diffs["AC"]=12.
// - alldiffs: The set of all differences that have been seen at least once
// - diffs : The number of occurences of each difference, for each sequence
// Sequences are ordered as the original alignment. Differences are
// written as REFNEW, ex: diffs["AC"]=12.
func (a *align) CountDifferences() (alldiffs []string, diffs []map[string]int) {
var alldiffsmap map[string]bool
var diffmap map[string]int
Expand Down Expand Up @@ -1917,7 +1933,8 @@ func (a *align) CountDifferences() (alldiffs []string, diffs []map[string]int) {
}

/*
Returns the number of variable sites in the alignment.
Returns the number of variable sites in the alignment.
It does not take into account gaps and other charactes like "."
*/
func (a *align) NbVariableSites() int {
Expand Down Expand Up @@ -2117,7 +2134,7 @@ func (a *align) CodonAlign(ntseqs SeqBag) (rtAl *align, err error) {

// Compute conservation status of a given site of the alignment
//
// If position is outside the alignment, it returns an error
// # If position is outside the alignment, it returns an error
//
// Possible values are:
//
Expand Down
8 changes: 5 additions & 3 deletions align/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,12 @@ var iupacCodeByte = [][]uint8{
// This matrix was created by Todd Lowe 12/10/92
//
// Uses ambiguous nucleotide codes, probabilities rounded to
// nearest integer
//
// nearest integer
//
// Lowest score = -4, Highest score = 5
//
//A T G C S W R Y K M B V H D N U
// A T G C S W R Y K M B V H D N U
var dnafull_subst_matrix = [][]float64{
{5, -4, -4, -4, -4, 1, 1, -4, -4, 1, -4, -1, -1, -1, -2, -4},
{-4, 5, -4, -4, -4, 1, -4, 1, 1, -4, -1, -4, -1, -1, -2, 5},
Expand Down Expand Up @@ -405,7 +406,8 @@ func Nt2Index(nt uint8) (idx int, err error) {
return
}

/*PossibleNtIUPAC returns the possible meaning of the given iupac nucleotide
/*
PossibleNtIUPAC returns the possible meaning of the given iupac nucleotide
Ex: NT_B : {NT_C, NT_G, NT_T}
*/
func PossibleNtIUPAC(nt uint8) (idx []uint8, err error) {
Expand Down
24 changes: 17 additions & 7 deletions cmd/cleansites.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ import (
"fmt"

"github.com/evolbioinfo/goalign/align"
"github.com/evolbioinfo/goalign/gutils"
"github.com/evolbioinfo/goalign/io"
"github.com/evolbioinfo/goalign/io/utils"
"github.com/spf13/cobra"
)

var cleanEnds bool
var sitesposoutfile string
var sitesreverse bool
var rmsitesposoutfile string

// cleansitesCmd represents the cleansites command
Expand All @@ -22,7 +24,8 @@ var cleansitesCmd = &cobra.Command{
Removes sites constitued of >= cutoff specific characters. This characters can be :
1. Gap (--char=GAP or --char=-, default)
2. Any other character X specified by --char=X (case sensitive)
2. Any other set of characters XYZ specified by --char=XYZ (case sensitive). In this case, it is possible to reverse the match with --reverse.
for example '--char ACGT --reverse' means any character but A,C,G,T.
3. The most abundant character in the site --char=MAJ (including gaps)
Exception for a cutoff of 0: removes sites constitued of > 0 specified character (with --char=MAJ, then will remove all columns).
Expand Down Expand Up @@ -82,18 +85,24 @@ will be removed.`,
} else {
//single character
c := []uint8(cleanChar)
if len(c) != 1 {
err = fmt.Errorf("--char should be a single character")
// if len(c) != 1 {
// err = fmt.Errorf("--char should be a single character")
// io.LogError(err)
// return
// }
char = string(c)
if (gutils.Contains(c, 'N') || gutils.Contains(c, 'n')) && cleanIgnoreNs {
err = fmt.Errorf("--ignore-n should not be given with --char N")
io.LogError(err)
return
}
char = string(c[0])
if (c[0] == 'N' || c[0] == 'n') && cleanIgnoreNs {
err = fmt.Errorf("--ignore-n should not be given with --char N")
if (gutils.Contains(c, '-')) && cleanIgnoreGaps {
err = fmt.Errorf("--ignore-gaps should not be given with --char -")
io.LogError(err)
return
}
nbstart, nbend, kept, rm = al.RemoveCharacterSites(c[0], cleanCutoff, cleanEnds, cleanIgnoreCase, cleanIgnoreGaps, cleanIgnoreNs)

nbstart, nbend, kept, rm = al.RemoveCharacterSites(c, cleanCutoff, cleanEnds, cleanIgnoreCase, cleanIgnoreGaps, cleanIgnoreNs, sitesreverse)
}
afterlength := al.Length()
writeAlign(al, f)
Expand Down Expand Up @@ -124,6 +133,7 @@ will be removed.`,

func init() {
cleansitesCmd.PersistentFlags().BoolVar(&cleanEnds, "ends", false, "If true, then only remove consecutive gap positions from alignment start and end")
cleansitesCmd.PersistentFlags().BoolVar(&sitesreverse, "reverse", false, "If true, then reverses the char match (not functional with --char GAP and --char MAJ)")
cleansitesCmd.PersistentFlags().StringVar(&sitesposoutfile, "positions", "none", "Output file of all remaining positions (0-based, on position per line)")
cleansitesCmd.PersistentFlags().StringVar(&rmsitesposoutfile, "positions-rm", "none", "Output file of all removed positions (0-based, on position per line)")
cleanCmd.AddCommand(cleansitesCmd)
Expand Down
3 changes: 2 additions & 1 deletion docs/commands/clean.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ Two subcommands:

* `goalign clean sites`: Removes sites constitued of >= cutoff specific characters. This characters can be :
1. Gap (--char=GAP or --char=-, default)
2. Any other character X specified by --char=X (case sensitive)
2. Any other set of characters XYZ specified by --char=XYZ (case sensitive). In this case, it is possible to reverse the match with --reverse.
for example '--char ACGT --reverse' means any character but A,C,G,T.
3. The most abundant character in the site --char=MAJ (including gaps)

Exception for a cutoff of 0: removes sites constitued of > 0 specified character (with --char=MAJ, then will remove all columns).
Expand Down
21 changes: 21 additions & 0 deletions gutils/gutils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package gutils

import "unicode"

func Contains[T comparable](s []T, e T) bool {
for _, v := range s {
if v == e {
return true
}
}
return false
}

func ContainsRune(s []uint8, e uint8, ignoreCase bool) bool {
for _, v := range s {
if v == e || (ignoreCase && unicode.ToLower(rune(v)) == unicode.ToLower(rune(e))) {
return true
}
}
return false
}
Loading

0 comments on commit 3cfac2a

Please sign in to comment.