Skip to content

Commit

Permalink
Added --posfile to command goalign replace : to replace specific char…
Browse files Browse the repository at this point in the history
…acters at specific positions of specific sequences in an input alignment
  • Loading branch information
fredericlemoine committed Oct 14, 2023
1 parent 8adbb88 commit 18217b2
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 8 deletions.
22 changes: 22 additions & 0 deletions align/align.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ type Alignment interface {
RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error)
// converts sites on the given sequence to coordinates on the alignment
RefSites(name string, sites []int) (refsites []int, err error)
// Overwrites the character at position "site" of the sequence "seqname" by "newchar"
ReplaceChar(seqname string, site int, newchar uint8) error
// Removes sequences having >= cutoff gaps, returns number of removed sequences
RemoveGapSeqs(cutoff float64, ignoreNs bool) int
// Removes sequences having >= cutoff character, returns number of removed sequences
Expand Down Expand Up @@ -1107,6 +1109,26 @@ func (a *align) TrimSequences(trimsize int, fromStart bool) error {
return nil
}

// ReplaceChar overwrites the character at position "site" of the sequence "seqname" by "newchar"
func (a *align) ReplaceChar(seqname string, site int, newchar uint8) (err error) {
var s Sequence
var exists bool
if site < 0 {
err = fmt.Errorf("replacechar: site cannot be < 0")
return
}
if site >= a.Length() {
err = fmt.Errorf("replacechar: site is outside alignment length")
return
}
if s, exists = a.GetSequenceByName(seqname); !exists {
err = fmt.Errorf("replacechar: sequence name does not exist in the alignment")
return
}
s.SequenceChar()[site] = newchar
return
}

// Samples randomly a subset of the sequences
// And returns this new alignment
// If nb < 1 or nb > nbsequences returns nil and an error
Expand Down
113 changes: 105 additions & 8 deletions cmd/replace.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
package cmd

import (
"bufio"
"compress/gzip"
"errors"
"fmt"
"os"
"strconv"
"strings"

"github.com/evolbioinfo/goalign/align"
"github.com/evolbioinfo/goalign/io"
Expand All @@ -10,25 +16,34 @@ import (
)

var replaceOutput string
var replacefile string
var replaceRegexp bool
var replaceOld, replaceNew string

type repchar struct {
seqname string
site int
newchar uint8
}

// renameCmd represents the rename command
var replaceCmd = &cobra.Command{
Use: "replace",
Short: "Replace characters in sequences of the input alignment (possible with a regex)",
Long: `Replace characters in sequences of the input alignment (possible with a regex).
If the replacement changes sequence length, then returns an error.
If --posfile is given, then --old and --new are not considered. Instead, characters at sites+sequences specified in the input file
are replaced in the alignement. The format of the input posfile is tabulated with columns:
0: sequence name
1: site index
2: new character
`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
var aligns *align.AlignChannel
var f utils.StringWriterCloser
var seqs align.SeqBag

if !cmd.Flags().Changed("old") || !cmd.Flags().Changed("new") {
err = errors.New("--old and --new must be specified")
return
}
var replace []repchar

if f, err = utils.OpenWriteFile(replaceOutput); err != nil {
io.LogError(err)
Expand All @@ -37,6 +52,11 @@ If the replacement changes sequence length, then returns an error.
defer utils.CloseWriteFile(f, replaceOutput)

if unaligned {
if !cmd.Flags().Changed("old") || !cmd.Flags().Changed("new") {
err = errors.New("--old and --new must be specified")
return
}

if seqs, err = readsequences(infile); err != nil {
io.LogError(err)
return
Expand All @@ -47,17 +67,39 @@ If the replacement changes sequence length, then returns an error.
}
writeSequences(seqs, f)
} else {
if replacefile == "none" && (!cmd.Flags().Changed("old") || !cmd.Flags().Changed("new")) {
err = errors.New("--old and --new must be specified")
return
}
if aligns, err = readalign(infile); err != nil {
io.LogError(err)
return
}
for al := range aligns.Achan {
if err = al.Replace(replaceOld, replaceNew, replaceRegexp); err != nil {

if replacefile != "none" {
if replace, err = readreplacefile(replacefile); err != nil {
io.LogError(err)
return
}
writeAlign(al, f)
for al := range aligns.Achan {
for _, rep := range replace {
if err = al.ReplaceChar(rep.seqname, rep.site, rep.newchar); err != nil {
io.LogError(err)
return
}
}
writeAlign(al, f)
}
} else {
for al := range aligns.Achan {
if err = al.Replace(replaceOld, replaceNew, replaceRegexp); err != nil {
io.LogError(err)
return
}
writeAlign(al, f)
}
}

if aligns.Err != nil {
err = aligns.Err
io.LogError(err)
Expand All @@ -74,5 +116,60 @@ func init() {
replaceCmd.PersistentFlags().BoolVarP(&replaceRegexp, "regexp", "e", false, "Considers Replace alignment using regexp")
replaceCmd.PersistentFlags().StringVarP(&replaceOld, "old", "s", "none", "String to replace in the sequences")
replaceCmd.PersistentFlags().StringVarP(&replaceNew, "new", "n", "none", "New string that will replace old string in sequences")
replaceCmd.PersistentFlags().StringVarP(&replacefile, "posfile", "f", "none", "File containing sites to replace by give characters in given sequences (deactivates --old & --new)")
replaceCmd.PersistentFlags().BoolVar(&unaligned, "unaligned", false, "Considers input sequences as unaligned and fasta format (phylip, nexus,... options are ignored)")
}

func readreplacefile(file string) (replace []repchar, err error) {
var f *os.File
var r *bufio.Reader
var gr *gzip.Reader

var seqname string
var site int
var newchar uint8
replace = make([]repchar, 0)

if file == "stdin" || file == "-" {
f = os.Stdin
} else {
if f, err = os.Open(file); err != nil {
return
}
}

if strings.HasSuffix(file, ".gz") {
if gr, err = gzip.NewReader(f); err != nil {
return
}
r = bufio.NewReader(gr)
} else {
r = bufio.NewReader(f)
}
l, e := utils.Readln(r)

for e == nil {
cols := strings.Split(l, "\t")
if cols == nil || len(cols) < 3 {
err = errors.New("bad format from replace char file: There should be 3 columns: seqname\\tsite\\tnewchar")
return
}

seqname = cols[0]
if site, err = strconv.Atoi(cols[1]); err != nil {
err = fmt.Errorf("cannot convert site index to int")
return
}
newchar = uint8(cols[2][0])

replace = append(
replace,
repchar{
seqname: seqname,
site: site,
newchar: newchar,
})
l, e = utils.Readln(r)
}
return
}
39 changes: 39 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4592,6 +4592,45 @@ ${GOALIGN} replace -s 'GA.' -e -n '---' -p -i input -o result
diff -q -b expected result
rm -f input expected result

echo "->goalign replace --posfile"

cat > expected <<EOF
>Seq0000
MGGATTAGTCMTTTTGCTGGACCTGTAGACTGAGCATTCATGCTGTGGCCGTTCTCGGAGCGAAACACGAAGAAAGTGTA
TTCCCAGTTAATTTGTTAGC
>Seq0001
AAATCAAACTCTGGGGCACAGGCCAACGTCAGAGCTGGACAATTCTCGCTATACAGACTGTAAGAGGCGATACAGCTCAG
GTCATTTGTCGACTGCTCTT
>Seq0002
MTTAAAGCATMGGCACCCTCCCATTTGTTAGCAACTATTTCGCCGAGAAAGTGGGCAAGCCTCCCCTTCGATGCCCAGGA
CCTTATCCCCGCTCTTCGCA
>Seq0003
TGTAATTCCTGAAGGTAGTAATATCGCATCGGACTTAACAGAACCTTATAGTACGGGGTATAATTCACTCCTCTGGTGCC
GCTAGGACTGCCGTCCCACC
>Seq0004
CAGAGATTCATGACCTTCTTTGGTAAGACGTCCGGGGACAACTACGTGAGTGGTGCGTTACGTATTAGCGTCAAAGCGCC
ATAATCATATACGGTAAAAT
>Seq0005
MTTGTCTTCAMCATAATTCGTTTCCTGGGGGTAAAGCCCGACCCCTGCTATTTAGTCAACTCTGGTGCTATCGGCTGCTC
CACTGCGCCGCCAATCTGCC
>Seq0006
CCCTATTTGGCATTTAGCTGTGGATTATAACGTTTCAGTATCCATTACCACAACACGCGATTTATTCGTCCGGGCAATGT
TCGGCTCAGATATGCGAATC
>Seq0007
TAGTGAATGTATCAGATGGGATAAGTTAATAACGAGCTGACTGCTACAGAGTAGCCTACCCAGCGGCGCATACACCTTGA
TAGTGAGGACTCCTGTGTAT
>Seq0008
GCTGCAATGAGCTGTGCTGATAGACGGTACAACGCTTAAGCCAAAGCGGTTGCTAGTAACATACCGGACCTTTCAGGAGC
AAAGGCTCCGAACCTACCAA
>Seq0009
MTTTACATTTMATTGGACAATTAGGACCATCTTATTCGCGATCCGTGACCAATTGCTATGGCTCGGGATATGTAGGATAG
GCCTCTCAGTTCACCTGACM
EOF

$GOALIGN random --seed 123456 | $GOALIGN replace --posfile posfile.txt > result
diff -q -b result expected
rm -f result expected

echo "->goalign diff"
cat > input <<EOF
10 100
Expand Down

0 comments on commit 18217b2

Please sign in to comment.