diff --git a/README.md b/README.md index 4982500..2e81bcc 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,7 @@ You may go to the [doc](docs/index.md) for a more detailed documentation of the * nseq * taxa * subseq: Extract a subsequence from the alignment (coordinates on alignment reference or on a given sequence reference) +* subsites: Extract sites from the input alignment (coordinates on alignment reference or on a given sequence reference) * subset: Take a subset of sequences from the input alignment * sw: Aligns 2 sequences using Smith & Waterman algorithm * translate: Translate input sequences/alignment (supports IUPAC code) diff --git a/align/align.go b/align/align.go index 50a898b..f7bafea 100644 --- a/align/align.go +++ b/align/align.go @@ -69,6 +69,8 @@ type Alignment interface { Recombine(rate float64, lenprop float64) // converts coordinates on the given sequence to coordinates on the alignment RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error) + // converts sites on the given sequence to coordinates on the alignment + RefSites(name string, sites []int) (refsites []int, err error) RemoveGapSeqs(cutoff float64) // Removes sequences having >= cutoff gaps RemoveGapSites(cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff gaps, returns the number of consecutive removed sites at start and end of alignment RemoveCharacterSites(c rune, cutoff float64, ends bool) (first, last int) // Removes sites having >= cutoff character, returns the number of consecutive removed sites at start and end of alignment @@ -81,6 +83,10 @@ type Alignment interface { SiteConservation(position int) (int, error) // If the site is conserved: Split(part *PartitionSet) ([]Alignment, error) //Splits the alignment given the paritions in argument SubAlign(start, length int) (Alignment, error) // Extract a subalignment from this alignment + SelectSites(sites []int) (Alignment, error) // Extract givens sites from the alignment + InverseCoordinates(start, length int) (invstarts, invlengths []int, err error) + InversePositions(sites []int) (invsites []int, err error) + Swap(rate float64) TrimSequences(trimsize int, fromStart bool) error } @@ -436,6 +442,57 @@ func (a *align) RefCoordinates(name string, refstart, reflen int) (alistart, ali return } +// RefSites converts coordinates on the given sequence to coordinates on the alignment. +// Coordinates on the given sequence corresponds to the sequence without gaps. Output coordinates +// on the alignent consider gaps. +// +// It returns an error if the sequence does not exist or if the coordinates are outside the ref +// sequence (<0 or > sequence length without gaps) +// Parameters: +// - name: The name of the sequence to take as reference +// - sites: The positions to convert (on the ref sequence, 0-based) +func (a *align) RefSites(name string, sites []int) (refsites []int, err error) { + var exists bool + var seq []rune + var tmpi int + var site rune + var ngaps int + var isite int + + if seq, exists = a.GetSequenceChar(name); !exists { + err = fmt.Errorf("Sequence %s does not exist in the alignment", name) + return + } + + mappos := make(map[int]bool) + for _, s := range sites { + if s < 0 { + err = fmt.Errorf("Site on reference sequence must be > 0 : %d", s) + return + } + if s >= a.Length() { + err = fmt.Errorf("Site is outside alignment : %d", s) + return + } + mappos[s] = true + } + + //look for start + tmpi = -1 + for isite, site = range seq { + if site != GAP { + tmpi++ + if _, ok := mappos[tmpi]; ok { + refsites = append(refsites, isite) + } + } else { + ngaps++ + } + } + + return +} + // Removes sequences constituted of [cutoff*100%,100%] Gaps // Exception fo a cutoff of 0: does not remove sequences with 0% gaps // Cutoff must be between 0 and 1, otherwise set to 0. @@ -1241,6 +1298,85 @@ func (a *align) SubAlign(start, length int) (subalign Alignment, err error) { return } +// SelectSites Extract a subalignment from this alignment, correponding to given positions +// (0-based inclusive coordinates). +func (a *align) SelectSites(sites []int) (subalign Alignment, err error) { + for _, site := range sites { + if site < 0 || site > a.Length() { + err = fmt.Errorf("Site is outside the alignment") + return + } + } + + subalign = NewAlign(a.alphabet) + for i := 0; i < a.NbSequences(); i++ { + seq := make([]rune, len(sites)) + alseq := a.seqs[i] + alseqchar := alseq.SequenceChar() + for j, site := range sites { + seq[j] = alseqchar[site] + } + subalign.AddSequenceChar(alseq.name, seq, alseq.Comment()) + } + return +} + +// InverseCoordinates takes a start and a length, and returns starts and lengths that are +// outside this sequence. +// starts are 0-based inclusive +func (a *align) InverseCoordinates(start, length int) (invstarts, invlengths []int, err error) { + invstarts = make([]int, 0) + invlengths = make([]int, 0) + if start < 0 || start > a.Length() { + err = fmt.Errorf("Start is outside the alignment") + return + } + if length < 0 { + err = fmt.Errorf("Length is negative") + return + } + if start+length < 0 || start+length > a.Length() { + err = fmt.Errorf("Start+Length is outside the alignment") + return + } + + if start > 0 { + invstarts = append(invstarts, 0) + invlengths = append(invlengths, start) + } + + if (start + length) < a.Length() { + invstarts = append(invstarts, start+length) + invlengths = append(invlengths, a.Length()-(start+length)) + } + + return +} + +func (a *align) InversePositions(sites []int) (invsites []int, err error) { + invsites = make([]int, 0) + + for _, s := range sites { + if s < 0 || s > a.Length() { + err = fmt.Errorf("Site is outside the alignment") + return + } + } + + posmap := make(map[int]bool) + for _, s := range sites { + posmap[s] = true + } + + for i := 0; i < a.Length(); i++ { + if _, ok := posmap[i]; !ok { + invsites = append(invsites, i) + } + } + + return +} + // Extract a subalignment with given length and a random start position from this alignment func (a *align) RandSubAlign(length int) (Alignment, error) { if length > a.Length() { diff --git a/align/align_test.go b/align/align_test.go index cde06cd..6bee636 100644 --- a/align/align_test.go +++ b/align/align_test.go @@ -1941,3 +1941,111 @@ func Test_align_MaskUnique(t *testing.T) { t.Error(fmt.Errorf("Remove majority failed")) } } + +func Test_align_InverseCoordinates(t *testing.T) { + var a Alignment + + a = NewAlign(NUCLEOTIDS) + a.AddSequence("A", "ACANGA-TACC", "") + a.AddSequence("B", "ACTN-T-TTTC", "") + a.AddSequence("C", "ACTN-TTT--T", "") + a.AddSequence("D", "C-ANCCCCCCC", "") + + expinvlengths := []int{2, 3} + expinvstarts := []int{0, 8} + + invstarts, invlengths, err := a.InverseCoordinates(0, 11) + if err != nil { + t.Error(err) + } + if len(invstarts) != 0 { + t.Errorf("Invstarts should be 0 length") + } + if len(invlengths) != 0 { + t.Errorf("Invlengths should be 0 length") + } + + invstarts, invlengths, err = a.InverseCoordinates(2, 6) + if err != nil { + t.Error(err) + } + + if len(invstarts) != 2 { + t.Errorf("Invstarts should be 2 length") + } + if len(invlengths) != 2 { + t.Errorf("Invlengths should be 2 length") + } + + if !reflect.DeepEqual(expinvlengths, invlengths) { + t.Error(fmt.Errorf("Invlengths is not expected, have %v, want %v", invlengths, expinvlengths)) + } + if !reflect.DeepEqual(expinvstarts, invstarts) { + t.Error(fmt.Errorf("Invlengths is not expected, have %v, want %v", invlengths, expinvlengths)) + } +} + +func Test_align_InversePositions(t *testing.T) { + var a Alignment + + a = NewAlign(NUCLEOTIDS) + a.AddSequence("A", "ACANGA-TACC", "") + a.AddSequence("B", "ACTN-T-TTTC", "") + a.AddSequence("C", "ACTN-TTT--T", "") + a.AddSequence("D", "C-ANCCCCCCC", "") + + positions := []int{1, 2, 6, 10} + expinvpositions := []int{0, 3, 4, 5, 7, 8, 9} + + invpositions, err := a.InversePositions(positions) + if err != nil { + t.Error(err) + } + if !reflect.DeepEqual(expinvpositions, invpositions) { + t.Error(fmt.Errorf("Invpositions is not expected, have %v, want %v", invpositions, expinvpositions)) + } + + positions = []int{} + expinvpositions = []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + + invpositions, err = a.InversePositions(positions) + if err != nil { + t.Error(err) + } + if !reflect.DeepEqual(expinvpositions, invpositions) { + t.Error(fmt.Errorf("Invpositions is not expected, have %v, want %v", invpositions, expinvpositions)) + } + + positions = []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} + expinvpositions = []int{} + + invpositions, err = a.InversePositions(positions) + if err != nil { + t.Error(err) + } + if !reflect.DeepEqual(expinvpositions, invpositions) { + t.Error(fmt.Errorf("Invpositions is not expected, have %v, want %v", invpositions, expinvpositions)) + } +} + +func Test_align_RefSites(t *testing.T) { + var err error + var alisites []int + + in := NewAlign(UNKNOWN) + + in.AddSequence("Seq0000", "--ACG--AT---GC", "") + in.AddSequence("Seq0001", "GGACGTTATCGGGC", "") + in.AutoAlphabet() + + expalisites := []int{2, 3, 4, 7} + sites := []int{0, 1, 2, 3} + + alisites, err = in.RefSites("Seq0000", sites) + if err != nil { + t.Error(err) + } + if !reflect.DeepEqual(expalisites, alisites) { + t.Error(fmt.Errorf("alisites: expected: %v, have: %v", expalisites, alisites)) + } +} diff --git a/cmd/root.go b/cmd/root.go index 0e07023..63ca4ba 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -9,6 +9,7 @@ import ( "math/rand" "os" "runtime" + "strconv" "strings" "time" @@ -348,3 +349,39 @@ func parsePartition(partitionfile string, alilength int) (ps *align.PartitionSet ps, err = p.Parse(alilength) return } + +func parseIntFile(file string) (ints []int, err error) { + var f *os.File + var r *bufio.Reader + var gr *gzip.Reader + var c int + + ints = make([]int, 0) + + if file == "stdin" || file == "-" { + f = os.Stdin + } else { + if f, err = os.Open(file); err != nil { + return + } + } + + if strings.HasSuffix(file, ".gz") { + if gr, err = gzip.NewReader(f); err != nil { + return + } + r = bufio.NewReader(gr) + } else { + r = bufio.NewReader(f) + } + l, e := utils.Readln(r) + for e == nil { + if c, err = strconv.Atoi(l); err != nil { + return + } + ints = append(ints, c) + l, e = utils.Readln(r) + } + + return +} diff --git a/cmd/subseq.go b/cmd/subseq.go index 6d0e59a..4fe1f0d 100644 --- a/cmd/subseq.go +++ b/cmd/subseq.go @@ -15,6 +15,7 @@ var subseqstart int var subseqlength int var subseqstep int var subseqrefseq string +var subseqreverse bool // subseqCmd represents the subseq command var subseqCmd = &cobra.Command{ @@ -93,7 +94,7 @@ If several alignments are present in the input file and the output is a file RunE: func(cmd *cobra.Command, args []string) (err error) { var aligns *align.AlignChannel var f *os.File - var subalign align.Alignment + var subalign, subaligntmp align.Alignment if aligns, err = readalign(infile); err != nil { io.LogError(err) @@ -132,9 +133,29 @@ If several alignments are present in the input file and the output is a file } subalignnum := 0 for { - if subalign, err = al.SubAlign(start, len); err != nil { - io.LogError(err) - return + starts := []int{start} + lens := []int{len} + if subseqreverse { + if starts, lens, err = al.InverseCoordinates(start, len); err != nil { + io.LogError(err) + return + } + } + subalign = nil + for i, s := range starts { + l := lens[i] + if subaligntmp, err = al.SubAlign(s, l); err != nil { + io.LogError(err) + return + } + if subalign == nil { + subalign = subaligntmp + } else { + if err = subalign.Concat(subaligntmp); err != nil { + io.LogError(err) + return + } + } } writeAlign(subalign, f) start += subseqstep @@ -170,5 +191,6 @@ func init() { subseqCmd.PersistentFlags().IntVarP(&subseqstart, "start", "s", 0, "Start position (0-based inclusive)") subseqCmd.PersistentFlags().IntVarP(&subseqlength, "length", "l", 10, "Length of the sub alignment") subseqCmd.PersistentFlags().StringVar(&subseqrefseq, "ref-seq", "none", "Reference sequence on which coordinates are given") + subseqCmd.PersistentFlags().BoolVarP(&subseqreverse, "reverse", "r", false, "Take all but the given subsequence") subseqCmd.PersistentFlags().IntVar(&subseqstep, "step", 0, "Step: If > 0, then will generate several alignments, for each window of length l, with starts: [start,start+step, ..., end-l]* ") } diff --git a/cmd/subsites.go b/cmd/subsites.go new file mode 100644 index 0000000..e933c67 --- /dev/null +++ b/cmd/subsites.go @@ -0,0 +1,149 @@ +package cmd + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + + "github.com/evolbioinfo/goalign/align" + "github.com/evolbioinfo/goalign/io" + "github.com/spf13/cobra" +) + +var subsitesout string = "stdout" +var subsitesfile string +var subsitesrefseq string +var subsitesreverse bool + +// subsitesCmd represents the subsites command +var subsitesCmd = &cobra.Command{ + Use: "subsites", + Short: "Takes a subset of the sites of the alignment", + Long: `Takes a subset of the sites of the alignment + +It takes an alignment and extracts only some sites from it, given +a set of sites given on the command line or on a given file. +If some sites are outside the alignment, will exit with an error. + +For example: +goalign subsites -p -i al.phy 1 2 3 4 + +will select sites 1, 2, 3, 4 from the alignment (0-based inclusive) + +The output format is the same than input format. + +If --ref-seq is specified, then the coordinates are considered according to the +given sequence, and without considering gaps. + +For example: +If al.fa is: +>s1 +--ACG--AT-GC +>s2 +GGACGTTATCGC + +goalign subsites -i al.fa --ref-seq s1 1 2 3 + +will output: + +>s1 +CGA +>s2 +CGA + +If --reverse is given, then will output all positions but the ones that should be output. +`, + RunE: func(cmd *cobra.Command, args []string) (err error) { + var aligns *align.AlignChannel + var f *os.File + var c int + var subalign align.Alignment + + if aligns, err = readalign(infile); err != nil { + io.LogError(err) + return + } + if f, err = openWriteFile(subsitesout); err != nil { + io.LogError(err) + return + } + + refseq := cmd.Flags().Changed("ref-seq") + + var sites []int + + if subsitesfile != "none" { + if sites, err = parseIntFile(subsitesfile); err != nil { + io.LogError(err) + return + } + } else { + sites = make([]int, 0) + for _, s := range args { + if c, err = strconv.Atoi(s); err != nil { + io.LogError(err) + return + } + sites = append(sites, c) + } + } + + if len(sites) == 0 { + err = fmt.Errorf("No sites are provided") + io.LogError(err) + return + } + + fileid := "" + filenum := 0 + extension := filepath.Ext(subsitesout) + name := subsitesout[0 : len(subsitesout)-len(extension)] + + for al := range aligns.Achan { + + if filenum > 0 && subsitesout != "stdout" && subsitesout != "-" { + fileid = fmt.Sprintf("%s_al%d%s", name, filenum, extension) + f.Close() + if f, err = openWriteFile(fileid); err != nil { + io.LogError(err) + return + } + } + + sitespos := sites + if refseq { + sitespos, err = al.RefSites(subsitesrefseq, sites) + } + + if subsitesreverse { + if sitespos, err = al.InversePositions(sitespos); err != nil { + io.LogError(err) + return + } + } + if subalign, err = al.SelectSites(sitespos); err != nil { + io.LogError(err) + return + } + writeAlign(subalign, f) + filenum++ + } + + f.Close() + + if aligns.Err != nil { + err = aligns.Err + io.LogError(err) + } + return + }, +} + +func init() { + RootCmd.AddCommand(subsitesCmd) + subsitesCmd.PersistentFlags().StringVarP(&subsitesout, "output", "o", "stdout", "Alignment output file") + subsitesCmd.PersistentFlags().StringVar(&subsitesrefseq, "ref-seq", "none", "Reference sequence on which coordinates are given") + subsitesCmd.PersistentFlags().StringVar(&subsitesfile, "sitefile", "none", "File with positions of sites to select (one perline)") + subsitesCmd.PersistentFlags().BoolVarP(&subsitesreverse, "reverse", "r", false, "Take all but the given sites") +} diff --git a/docs/commands/subseq.md b/docs/commands/subseq.md index 408973e..760ce64 100644 --- a/docs/commands/subseq.md +++ b/docs/commands/subseq.md @@ -96,6 +96,12 @@ If several alignments are present in the input file and the output is a file (no * Other alignments, other subalignments: results will be placed in the given file with `_al` and `_sub` suffixes (ex `out_al1_sub1.fasta`, `out_al1_sub2.fasta`, etc.) +In any case: +------------ + +If `--reverse` is given, all positions but the given subsequence is output. In case of a sliding window, it will correspond to sliding subsequence "deletion". In case of reference sequence coordinates, the subsequence is first computed, then all positions of the alignment but this subsequence are given in output. + + #### Usage ``` Usage: diff --git a/docs/commands/subsites.md b/docs/commands/subsites.md new file mode 100644 index 0000000..646e4c0 --- /dev/null +++ b/docs/commands/subsites.md @@ -0,0 +1,86 @@ +# Goalign: toolkit and api for alignment manipulation + +## Commands + +### subsites +This command takes a subset of the sites of the given alignment from the input alignment. + +User provides a set of positions to extract from the input alignment, either in the command line or in an input file (one position per line, 0-based coordinates). + +Site position (0-based, inclusive) can be specified on the alignment coordinate system (default) +or on a given sequence coordinate system without gaps (`--ref-seq`). It allows +to extract positions by just knowing coordinates on a given unaligned reference sequence. + +For example: +``` +goalign subsites -p -i al.phy 1 2 3 +``` + +or + +``` +goalign subsites -p -i al.phy --sitefile positions.txt +``` + + +The output format is the same than input format. + +If `--ref-seq ` is specified, then the coordinates are specified on the given sequence +coordinate system without considering gaps. + +For example: + +If al.fa is: +``` +>s1 +--ACG--AT-GC +>s2 +GGACGTTATCGC +``` + +Then: +``` +goalign subsites -i al.fa --ref-seq s1 0 1 2 3 +```` + +will output: + +``` +>s1 +ACGA +>s2 +ACGA +``` + +In any case: +------------ + +If `--reverse` is given, all positions but the given positions are output. In case of reference sequence coordinates, the positions are first extracted based on reference coordinates, then all positions of the alignment but these are given in output. + + +#### Usage +``` +Usage: + goalign subsites [flags] + +Flags: + -h, --help help for subsites + -o, --output string Alignment output file (default "stdout") + --ref-seq string Reference sequence on which coordinates are given (default "none") + -r, --reverse Take all but the given sites + --sitefile string File with positions of sites to select (one perline) (default "none") + +Global Flags: + -i, --align string Alignment input file (default "stdin") + --auto-detect Auto detects input format (overrides -p, -x and -u) + -u, --clustal Alignment is in clustal? default fasta + --ignore-identical Ignore duplicated sequences that have the same name and same sequences + --input-strict Strict phylip input format (only used with -p) + -x, --nexus Alignment is in nexus? default fasta + --no-block Write Phylip sequences without space separated blocks (only used with -p) + --one-line Write Phylip sequences on 1 line (only used with -p) + --output-strict Strict phylip output format (only used with -p) + -p, --phylip Alignment is in phylip? default fasta +``` + + diff --git a/docs/index.md b/docs/index.md index 00c3f43..598a97f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -118,6 +118,7 @@ Command | Subcommand | -- | taxa | Prints index (position) and name of taxa of the alignment file [subseq](commands/subseq.md) ([api](api/subseq.md)) | | Take a sub-alignment from the input alignment [subset](commands/subset.md) ([api](api/subset.md)) | | Take a subset of sequences from the input alignment +[subsites](commands/subsites.md) (api) | | Take a subset of the sites from the input alignment [sw](commands/sw.md) ([api](api/sw.md)) | | Aligns 2 sequences using Smith&Waterman algorithm [translate](commands/translate.md) ([api](api/translate.md))| | Translates an input sequence into Amino-Acids [trim](commands/trim.md) ([api](api/trim.md)) | | This command trims names of sequences or sequences themselves diff --git a/test.sh b/test.sh index da13b6b..7633c29 100755 --- a/test.sh +++ b/test.sh @@ -1851,6 +1851,23 @@ ${GOALIGN} subseq -i input -l 4 -s 1 --ref-seq Seq0000 > result diff -q -b result expected rm -f input expected result +echo "->goalign subseq / refseq / rev" +cat > input <Seq0000 +--ACG--AT-GC +>Seq0001 +GGACGTTATCGC +EOF +cat > expected <Seq0000 +--A-GC +>Seq0001 +GGACGC +EOF +${GOALIGN} subseq -i input -l 4 -s 1 --ref-seq Seq0000 --reverse > result +diff -q -b result expected +rm -f input expected result + echo "->goalign subseq window phylip" cat > input <goalign subsites /1" +cat > input <s1 +ACGTACGT +>s2 +-CGT-C-T +>s3 +ACGTACGT +>s4 +ACGTTCGA +>s5 +ACGTTCGA +EOF + +cat > exp <s1 +CACGT +>s2 +C-C-T +>s3 +CACGT +>s4 +CTCGA +>s5 +CTCGA +EOF + + +${GOALIGN} subsites -i input 1 4 5 6 7 -o result +diff -q -b exp result +rm -f input exp result + +echo "->goalign subsites --reverse /2" +cat > input <s1 +ACGTACGT +>s2 +-CGT-C-T +>s3 +ACGTACGT +>s4 +ACGTTCGA +>s5 +ACGTTCGA +EOF + +cat > exp <s1 +AGT +>s2 +-GT +>s3 +AGT +>s4 +AGT +>s5 +AGT +EOF + +${GOALIGN} subsites -i input --reverse -o result 1 4 5 6 7 +diff -q -b exp result +rm -f input exp result + +echo "->goalign subsites --reverse --ref-seq /2" +cat > input <s1 +ACGTACGT +>s2 +-CGT-C-T +>s3 +ACGTACGT +>s4 +ACGTTCGA +>s5 +ACGTTCGA +EOF + +cat > exp <s1 +ACTACG +>s2 +-CT-C- +>s3 +ACTACG +>s4 +ACTTCG +>s5 +ACTTCG +EOF + +${GOALIGN} subsites -i input --reverse --ref-seq s2 -o result 1 4 +diff -q -b exp result +rm -f input exp result + +echo "->goalign subsites from file" +cat > input <s1 +ACGTACGT +>s2 +-CGT-C-T +>s3 +ACGTACGT +>s4 +ACGTTCGA +>s5 +ACGTTCGA +EOF + +cat > posfile < exp <s1 +CACGT +>s2 +C-C-T +>s3 +CACGT +>s4 +CTCGA +>s5 +CTCGA +EOF + +${GOALIGN} subsites -i input --sitefile posfile -o result +diff -q -b exp result +rm -f input exp result