Skip to content

Commit

Permalink
Added --reverse to subseqs and subsite command #7
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Jun 1, 2020
1 parent db73762 commit 708f86a
Show file tree
Hide file tree
Showing 10 changed files with 700 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ You may go to the [doc](docs/index.md) for a more detailed documentation of the
* nseq
* taxa
* subseq: Extract a subsequence from the alignment (coordinates on alignment reference or on a given sequence reference)
* subsites: Extract sites from the input alignment (coordinates on alignment reference or on a given sequence reference)
* subset: Take a subset of sequences from the input alignment
* sw: Aligns 2 sequences using Smith & Waterman algorithm
* translate: Translate input sequences/alignment (supports IUPAC code)
Expand Down
136 changes: 136 additions & 0 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ type Alignment interface {
Recombine(rate float64, lenprop float64)
// converts coordinates on the given sequence to coordinates on the alignment
RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error)
// converts sites on the given sequence to coordinates on the alignment
RefSites(name string, sites []int) (refsites []int, err error)
RemoveGapSeqs(cutoff float64) // Removes sequences having >= cutoff gaps
RemoveGapSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment
RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment
Expand All @@ -81,6 +83,10 @@ type Alignment interface {
SiteConservation(position int) (int, error) // If the site is conserved:
Split(part *PartitionSet) ([]Alignment, error) //Splits the alignment given the paritions in argument
SubAlign(start, length int) (Alignment, error) // Extract a subalignment from this alignment
SelectSites(sites []int) (Alignment, error) // Extract givens sites from the alignment
InverseCoordinates(start, length int) (invstarts, invlengths []int, err error)
InversePositions(sites []int) (invsites []int, err error)

Swap(rate float64)
TrimSequences(trimsize int, fromStart bool) error
}
Expand Down Expand Up @@ -436,6 +442,57 @@ func (a *align) RefCoordinates(name string, refstart, reflen int) (alistart, ali
return
}

// RefSites converts coordinates on the given sequence to coordinates on the alignment.
// Coordinates on the given sequence corresponds to the sequence without gaps. Output coordinates
// on the alignent consider gaps.
//
// It returns an error if the sequence does not exist or if the coordinates are outside the ref
// sequence (<0 or > sequence length without gaps)
// Parameters:
// - name: The name of the sequence to take as reference
// - sites: The positions to convert (on the ref sequence, 0-based)
func (a *align) RefSites(name string, sites []int) (refsites []int, err error) {
var exists bool
var seq []rune
var tmpi int
var site rune
var ngaps int
var isite int

if seq, exists = a.GetSequenceChar(name); !exists {
err = fmt.Errorf("Sequence %s does not exist in the alignment", name)
return
}

mappos := make(map[int]bool)
for _, s := range sites {
if s < 0 {
err = fmt.Errorf("Site on reference sequence must be > 0 : %d", s)
return
}
if s >= a.Length() {
err = fmt.Errorf("Site is outside alignment : %d", s)
return
}
mappos[s] = true
}

//look for start
tmpi = -1
for isite, site = range seq {
if site != GAP {
tmpi++
if _, ok := mappos[tmpi]; ok {
refsites = append(refsites, isite)
}
} else {
ngaps++
}
}

return
}

// Removes sequences constituted of [cutoff*100%,100%] Gaps
// Exception fo a cutoff of 0: does not remove sequences with 0% gaps
// Cutoff must be between 0 and 1, otherwise set to 0.
Expand Down Expand Up @@ -1241,6 +1298,85 @@ func (a *align) SubAlign(start, length int) (subalign Alignment, err error) {
return
}

// SelectSites Extract a subalignment from this alignment, correponding to given positions
// (0-based inclusive coordinates).
func (a *align) SelectSites(sites []int) (subalign Alignment, err error) {
for _, site := range sites {
if site < 0 || site > a.Length() {
err = fmt.Errorf("Site is outside the alignment")
return
}
}

subalign = NewAlign(a.alphabet)
for i := 0; i < a.NbSequences(); i++ {
seq := make([]rune, len(sites))
alseq := a.seqs[i]
alseqchar := alseq.SequenceChar()
for j, site := range sites {
seq[j] = alseqchar[site]
}
subalign.AddSequenceChar(alseq.name, seq, alseq.Comment())
}
return
}

// InverseCoordinates takes a start and a length, and returns starts and lengths that are
// outside this sequence.
// starts are 0-based inclusive
func (a *align) InverseCoordinates(start, length int) (invstarts, invlengths []int, err error) {
invstarts = make([]int, 0)
invlengths = make([]int, 0)
if start < 0 || start > a.Length() {
err = fmt.Errorf("Start is outside the alignment")
return
}
if length < 0 {
err = fmt.Errorf("Length is negative")
return
}
if start+length < 0 || start+length > a.Length() {
err = fmt.Errorf("Start+Length is outside the alignment")
return
}

if start > 0 {
invstarts = append(invstarts, 0)
invlengths = append(invlengths, start)
}

if (start + length) < a.Length() {
invstarts = append(invstarts, start+length)
invlengths = append(invlengths, a.Length()-(start+length))
}

return
}

func (a *align) InversePositions(sites []int) (invsites []int, err error) {
invsites = make([]int, 0)

for _, s := range sites {
if s < 0 || s > a.Length() {
err = fmt.Errorf("Site is outside the alignment")
return
}
}

posmap := make(map[int]bool)
for _, s := range sites {
posmap[s] = true
}

for i := 0; i < a.Length(); i++ {
if _, ok := posmap[i]; !ok {
invsites = append(invsites, i)
}
}

return
}

// Extract a subalignment with given length and a random start position from this alignment
func (a *align) RandSubAlign(length int) (Alignment, error) {
if length > a.Length() {
Expand Down
108 changes: 108 additions & 0 deletions align/align_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1941,3 +1941,111 @@ func Test_align_MaskUnique(t *testing.T) {
t.Error(fmt.Errorf("Remove majority failed"))
}
}

func Test_align_InverseCoordinates(t *testing.T) {
var a Alignment

a = NewAlign(NUCLEOTIDS)
a.AddSequence("A", "ACANGA-TACC", "")
a.AddSequence("B", "ACTN-T-TTTC", "")
a.AddSequence("C", "ACTN-TTT--T", "")
a.AddSequence("D", "C-ANCCCCCCC", "")

expinvlengths := []int{2, 3}
expinvstarts := []int{0, 8}

invstarts, invlengths, err := a.InverseCoordinates(0, 11)
if err != nil {
t.Error(err)
}
if len(invstarts) != 0 {
t.Errorf("Invstarts should be 0 length")
}
if len(invlengths) != 0 {
t.Errorf("Invlengths should be 0 length")
}

invstarts, invlengths, err = a.InverseCoordinates(2, 6)
if err != nil {
t.Error(err)
}

if len(invstarts) != 2 {
t.Errorf("Invstarts should be 2 length")
}
if len(invlengths) != 2 {
t.Errorf("Invlengths should be 2 length")
}

if !reflect.DeepEqual(expinvlengths, invlengths) {
t.Error(fmt.Errorf("Invlengths is not expected, have %v, want %v", invlengths, expinvlengths))
}
if !reflect.DeepEqual(expinvstarts, invstarts) {
t.Error(fmt.Errorf("Invlengths is not expected, have %v, want %v", invlengths, expinvlengths))
}
}

func Test_align_InversePositions(t *testing.T) {
var a Alignment

a = NewAlign(NUCLEOTIDS)
a.AddSequence("A", "ACANGA-TACC", "")
a.AddSequence("B", "ACTN-T-TTTC", "")
a.AddSequence("C", "ACTN-TTT--T", "")
a.AddSequence("D", "C-ANCCCCCCC", "")

positions := []int{1, 2, 6, 10}
expinvpositions := []int{0, 3, 4, 5, 7, 8, 9}

invpositions, err := a.InversePositions(positions)
if err != nil {
t.Error(err)
}
if !reflect.DeepEqual(expinvpositions, invpositions) {
t.Error(fmt.Errorf("Invpositions is not expected, have %v, want %v", invpositions, expinvpositions))
}

positions = []int{}
expinvpositions = []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

invpositions, err = a.InversePositions(positions)
if err != nil {
t.Error(err)
}
if !reflect.DeepEqual(expinvpositions, invpositions) {
t.Error(fmt.Errorf("Invpositions is not expected, have %v, want %v", invpositions, expinvpositions))
}

positions = []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
expinvpositions = []int{}

invpositions, err = a.InversePositions(positions)
if err != nil {
t.Error(err)
}
if !reflect.DeepEqual(expinvpositions, invpositions) {
t.Error(fmt.Errorf("Invpositions is not expected, have %v, want %v", invpositions, expinvpositions))
}
}

func Test_align_RefSites(t *testing.T) {
var err error
var alisites []int

in := NewAlign(UNKNOWN)

in.AddSequence("Seq0000", "--ACG--AT---GC", "")
in.AddSequence("Seq0001", "GGACGTTATCGGGC", "")
in.AutoAlphabet()

expalisites := []int{2, 3, 4, 7}
sites := []int{0, 1, 2, 3}

alisites, err = in.RefSites("Seq0000", sites)
if err != nil {
t.Error(err)
}
if !reflect.DeepEqual(expalisites, alisites) {
t.Error(fmt.Errorf("alisites: expected: %v, have: %v", expalisites, alisites))
}
}
37 changes: 37 additions & 0 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"math/rand"
"os"
"runtime"
"strconv"
"strings"
"time"

Expand Down Expand Up @@ -348,3 +349,39 @@ func parsePartition(partitionfile string, alilength int) (ps *align.PartitionSet
ps, err = p.Parse(alilength)
return
}

func parseIntFile(file string) (ints []int, err error) {
var f *os.File
var r *bufio.Reader
var gr *gzip.Reader
var c int

ints = make([]int, 0)

if file == "stdin" || file == "-" {
f = os.Stdin
} else {
if f, err = os.Open(file); err != nil {
return
}
}

if strings.HasSuffix(file, ".gz") {
if gr, err = gzip.NewReader(f); err != nil {
return
}
r = bufio.NewReader(gr)
} else {
r = bufio.NewReader(f)
}
l, e := utils.Readln(r)
for e == nil {
if c, err = strconv.Atoi(l); err != nil {
return
}
ints = append(ints, c)
l, e = utils.Readln(r)
}

return
}
30 changes: 26 additions & 4 deletions cmd/subseq.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ var subseqstart int
var subseqlength int
var subseqstep int
var subseqrefseq string
var subseqreverse bool

// subseqCmd represents the subseq command
var subseqCmd = &cobra.Command{
Expand Down Expand Up @@ -93,7 +94,7 @@ If several alignments are present in the input file and the output is a file
RunE: func(cmd *cobra.Command, args []string) (err error) {
var aligns *align.AlignChannel
var f *os.File
var subalign align.Alignment
var subalign, subaligntmp align.Alignment

if aligns, err = readalign(infile); err != nil {
io.LogError(err)
Expand Down Expand Up @@ -132,9 +133,29 @@ If several alignments are present in the input file and the output is a file
}
subalignnum := 0
for {
if subalign, err = al.SubAlign(start, len); err != nil {
io.LogError(err)
return
starts := []int{start}
lens := []int{len}
if subseqreverse {
if starts, lens, err = al.InverseCoordinates(start, len); err != nil {
io.LogError(err)
return
}
}
subalign = nil
for i, s := range starts {
l := lens[i]
if subaligntmp, err = al.SubAlign(s, l); err != nil {
io.LogError(err)
return
}
if subalign == nil {
subalign = subaligntmp
} else {
if err = subalign.Concat(subaligntmp); err != nil {
io.LogError(err)
return
}
}
}
writeAlign(subalign, f)
start += subseqstep
Expand Down Expand Up @@ -170,5 +191,6 @@ func init() {
subseqCmd.PersistentFlags().IntVarP(&subseqstart, "start", "s", 0, "Start position (0-based inclusive)")
subseqCmd.PersistentFlags().IntVarP(&subseqlength, "length", "l", 10, "Length of the sub alignment")
subseqCmd.PersistentFlags().StringVar(&subseqrefseq, "ref-seq", "none", "Reference sequence on which coordinates are given")
subseqCmd.PersistentFlags().BoolVarP(&subseqreverse, "reverse", "r", false, "Take all but the given subsequence")
subseqCmd.PersistentFlags().IntVar(&subseqstep, "step", 0, "Step: If > 0, then will generate several alignments, for each window of length l, with starts: [start,start+step, ..., end-l]* ")
}
Loading

0 comments on commit 708f86a

Please sign in to comment.