Skip to content

Commit

Permalink
Added an option to reverse complement only some sequences in a given …
Browse files Browse the repository at this point in the history
…input file
  • Loading branch information
fredericlemoine committed Apr 3, 2024
1 parent f75f0b3 commit f5c7c76
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 13 deletions.
35 changes: 30 additions & 5 deletions align/seqbag.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,12 @@ type SeqBag interface {
RarefySeqBag(nb int, counts map[string]int) (SeqBag, error) // Take a new rarefied sample taking into accounts weights
Rename(namemap map[string]string)
RenameRegexp(regex, replace string, namemap map[string]string) error
Replace(old, new string, regex bool) error // Replaces old string with new string in sequences of the alignment
ShuffleSequences() // Shuffle sequence order
String() string // Raw string representation (just write all sequences)
Translate(phase int, geneticcode int) (err error) // Translates nt sequence in aa
ReverseComplement() (err error) // Reverse-complements the alignment
Replace(old, new string, regex bool) error // Replaces old string with new string in sequences of the alignment
ShuffleSequences() // Shuffle sequence order
String() string // Raw string representation (just write all sequences)
Translate(phase int, geneticcode int) (err error) // Translates nt sequence in aa
ReverseComplement() (err error) // Reverse-complements the alignment
ReverseComplementSequences(name ...string) (err error) // Reverse-complements some sequences in the alignment
TrimNames(namemap map[string]string, size int) error
TrimNamesAuto(namemap map[string]string, curid *int) error
Sort() // Sorts the sequences by name
Expand Down Expand Up @@ -945,6 +946,30 @@ func (sb *seqbag) ReverseComplement() (err error) {
return
}

/*
ReverseComplementSequences reverse complements only the sequences whose
names are given as arguments.
  - if the alphabet is not NUCLEOTIDS: returns an error
  - names that are not found in the sequence bag are skipped without error
  - a name given several times is processed once per occurrence
  - processing stops at the first error returned by Complement
  - IUPAC characters are supported
*/
func (sb *seqbag) ReverseComplementSequences(names ...string) (err error) {
	// Reverse complement is only defined for nucleotide sequences.
	if sb.Alphabet() != NUCLEOTIDS {
		return errors.New("wrong alphabet, cannot reverse complement")
	}

	for i := range names {
		seq, ok := sb.SequenceByName(names[i])
		if !ok {
			// Unknown names are ignored rather than reported.
			continue
		}
		// Complement first, then reverse the same sequence.
		if err = Complement(seq.SequenceChar()); err != nil {
			return err
		}
		Reverse(seq.SequenceChar())
	}

	return nil
}

// Translate sequences in 3 phases (or 6 phases if reverse strand is true)
// And return the longest orf found
func (sb *seqbag) LongestORF(reverse bool) (orf Sequence, err error) {
Expand Down
32 changes: 26 additions & 6 deletions cmd/revcomp.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ var revCompCmd = &cobra.Command{
If the input alignment is not nucleotides, then returns an error.
IUPAC codes are taken into account for the reverse complement.
If sequence names are given in the command line (e.g. goalign revcomp -i al.fasta s1 s2 s3),
only given sequences are reverse-complemented, if they exist in the alignment.
`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
var f utils.StringWriterCloser
Expand All @@ -37,9 +40,18 @@ IUPAC codes are taken into account for the reverse complement.
io.LogError(err)
return
}
if err = seqs.ReverseComplement(); err != nil {
io.LogError(err)
return

// If names are given as arguments, we reverse complement only these sequences
if len(args) > 0 {
if err = seqs.ReverseComplementSequences(args...); err != nil {
io.LogError(err)
return
}
} else {
if err = seqs.ReverseComplement(); err != nil {
io.LogError(err)
return
}
}
writeSequences(seqs, f)
} else {
Expand All @@ -51,9 +63,17 @@ IUPAC codes are taken into account for the reverse complement.
return
}
for al = range aligns.Achan {
if err = al.ReverseComplement(); err != nil {
io.LogError(err)
return
// If names are given as arguments, we reverse complement only these sequences
if len(args) > 0 {
if err = al.ReverseComplementSequences(args...); err != nil {
io.LogError(err)
return
}
} else {
if err = al.ReverseComplement(); err != nil {
io.LogError(err)
return
}
}
writeAlign(al, f)
}
Expand Down
3 changes: 3 additions & 0 deletions docs/commands/revcomp.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ If `--unaligned` is specified, then input sequences may be unaligned.

IUPAC codes are taken into account.

If sequence names are given on the command line (e.g. `goalign revcomp -i al.fasta s1 s2 s3`),
only those sequences are reverse-complemented; names that do not exist in the alignment are ignored.

#### Usage
```
Usage:
Expand Down
22 changes: 20 additions & 2 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5918,10 +5918,19 @@ cat > expected <<EOF
>s2
.*NVHDBMKWSRYCGTTA
EOF
cat > expected.2 <<EOF
>s1
.*NBDHVKMWSYRGCAAT
>s2
TAACGRYSWMKVHDBN*.
EOF

${GOALIGN} revcomp -i input > output
diff -q -b expected output
rm -rf input output expected

${GOALIGN} revcomp -i input s1 > output
diff -q -b expected.2 output
rm -rf input output expected expected.2


echo "->goalign revcomp --unaligned"
Expand All @@ -5937,7 +5946,16 @@ cat > expected <<EOF
>s2
ACGT.*NVHDBMKWSRYCGTTA
EOF
cat > expected.2 <<EOF
>s1
ATUGCYRSWKMBDHVN*.
>s2
ACGT.*NVHDBMKWSRYCGTTA
EOF

${GOALIGN} revcomp -i input --unaligned > output
diff -q -b expected output
rm -rf input output expected

${GOALIGN} revcomp -i input --unaligned s2 > output
diff -q -b expected.2 output
rm -rf input output expected expected.2

0 comments on commit f5c7c76

Please sign in to comment.