From c13a469379cdea012eef1eb9ac9fcdf7f947f0aa Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Wed, 20 Dec 2023 13:47:39 +0100 Subject: [PATCH] Added option swap to goalign shuffle recomb --- align/align.go | 23 ++++++++++++++++------- cmd/recomb.go | 27 ++++++++++++++++++++++++--- docs/commands/shuffle.md | 10 ++++++++-- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/align/align.go b/align/align.go index d59fc49..94c4941 100644 --- a/align/align.go +++ b/align/align.go @@ -98,7 +98,7 @@ type Alignment interface { Pssm(log bool, pseudocount float64, normalization int) (pssm map[uint8][]float64, err error) // Normalization: PSSM_NORM_NONE, PSSM_NORM_UNIF, PSSM_NORM_DATA Rarefy(nb int, counts map[string]int) (Alignment, error) // Take a new rarefied sample taking into accounts weights RandSubAlign(length int, consecutive bool) (Alignment, error) // Extract a random subalignment with given length from this alignment - Recombine(rate float64, lenprop float64) + Recombine(rate float64, lenprop float64, swap bool) error // converts coordinates on the given sequence to coordinates on the alignment RefCoordinates(name string, refstart, refend int) (alistart, aliend int, err error) // converts sites on the given sequence to coordinates on the alignment @@ -959,19 +959,23 @@ func (a *align) TranslateByReference(phase int, geneticcode int, refseq string) return } -// Recombines a rate of the sequences to another sequences -// takes rate/2 seqs and copy/paste a portion of them to the other -// rate/2 seqs at a random position -// if rate < 0 : does nothing -// if rate > 1 : does nothing +// Recombine recombines a rate of the sequences into other sequences +// takes prop*nseq seqs and copy/paste a portion of them to the other +// prop*nseq seqs +// if prop < 0 : error +// if prop > 0.5 : error // prop must be <= 0.5 because it will recombine x% of seqs based on other x% of seqs -func (a *align) Recombine(prop float64, lenprop float64) { +// if swap is 
true, then swaps the two portions of sequences (2*prop sequences will be impacted) +// if swap is false, then just transfers the portion of seq1 to seq2 +func (a *align) Recombine(prop float64, lenprop float64, swap bool) (err error) { var seq1, seq2 *seq if prop < 0 || prop > 0.5 { + err = fmt.Errorf("proportion of sequence is outside of [0,0.5] range") return } if lenprop < 0 || lenprop > 1 { + err = fmt.Errorf("proportion of sequence length is outside of [0,1] range") return } @@ -985,9 +989,14 @@ func (a *align) Recombine(prop float64, lenprop float64) { seq1 = a.seqs[permutation[i]] seq2 = a.seqs[permutation[i+nb]] for j := pos; j < pos+lentorecomb; j++ { + tmp := seq1.sequence[j] seq1.sequence[j] = seq2.sequence[j] + if swap { + seq2.sequence[j] = tmp + } } } + return } // Add prop*100% gaps to lenprop*100% of the sequences diff --git a/cmd/recomb.go b/cmd/recomb.go index 81fe50d..5641b5b 100644 --- a/cmd/recomb.go +++ b/cmd/recomb.go @@ -9,6 +9,7 @@ import ( var recombNb float64 var recombProp float64 +var recombSwap bool // recombCmd represents the recomb command var recombCmd = &cobra.Command{ @@ -16,22 +17,38 @@ var recombCmd = &cobra.Command{ Short: "Recombine sequences in the input alignment", Long: `Recombine of sequences in the input alignment. +This command recombines a proportion of the input sequences into other sequences +It takes prop*nseq seqs and copy/paste a portion of them to the other prop*nseq sequences. + +- if prop < 0 or prop > 0.5 : error (prop must be <= 0.5 because it will recombine x% of + seqs based on other x% of seqs) +- if swap is true, then swaps the two portions of sequences (2*prop sequences will be impacted) +- if swap is false, then just transfers the portion of seq1 to seq2 + It may take Fasta or Phylip input alignment. If the input alignment contains several alignments, will process all of them -Two options: +Three options: 1 - The proportion of recommbining sequences. 
It will take n sequences and will copy/paste a portion of another n sequences; 2 - The proportion of the sequence length to recombine. +3 - Swap or not -Recombine 25% of sequences by 50%: +Recombine 25% of sequences by 50% (no swap): s1 CCCCCCCCCCCCCC s1 CCCCCCCCCCCCCC s2 AAAAAAAAAAAAAA => s2 AAAATTTTTTTAAA s3 GGGGGGGGGGGGGG s3 GGGGGGGGGGGGGG s4 TTTTTTTTTTTTTT s4 TTTTTTTTTTTTTT +Recombine 2x25% of sequences by 50% (swap): + +s1 CCCCCCCCCCCCCC s1 CCCCCCCCCCCCCC +s2 AAAAAAAAAAAAAA => s2 AAAATTTTTTTAAA +s3 GGGGGGGGGGGGGG s3 GGGGGGGGGGGGGG +s4 TTTTTTTTTTTTTT s4 TTTTAAAAAAATTT + Example of usage: goalign shuffle recomb -i align.phylip -p -n 1 -l 0.5 @@ -52,7 +69,10 @@ goalign shuffle recomb -i align.fasta -r 0.5 -n 1 -l 0.5 defer utils.CloseWriteFile(f, shuffleOutput) for al := range aligns.Achan { - al.Recombine(recombNb, recombProp) + if err = al.Recombine(recombNb, recombProp, recombSwap); err != nil { + io.LogError(err) + return + } writeAlign(al, f) } @@ -69,4 +89,5 @@ func init() { recombCmd.PersistentFlags().Float64VarP(&recombNb, "prop-seq", "n", 0.5, "Proportion of the sequences to recombine") recombCmd.PersistentFlags().Float64VarP(&recombProp, "prop-length", "l", 0.5, "Proportion of length of sequences to recombine") + recombCmd.PersistentFlags().BoolVar(&recombSwap, "swap", false, "If true, swaps sequences, otherwise just transfer seq1 subseq to seq2") } diff --git a/docs/commands/shuffle.md b/docs/commands/shuffle.md index dc046fe..58481ba 100644 --- a/docs/commands/shuffle.md +++ b/docs/commands/shuffle.md @@ -4,13 +4,19 @@ ### shuffle This command adds different type of noises in an input alignment, with these sub-commands: -* `goalign shuffle recomb`:Recombine a given proportion of the length of a given proportion of the sequences with other sequences (copy/paste). - Example: +* `goalign shuffle recomb`:Recombine a given proportion of the length of a given proportion of the sequences with other sequences (copy/paste). 
If swap is true, 2 x proportion of the sequences will be impacted (see Example 2) + Example (no swap): ``` s1 CCCCCCCCCCCCCC s1 CCCCCCCCCCCCCC s2 AAAAAAAAAAAAAA => s2 AAAATTTTTTTAAA s3 GGGGGGGGGGGGGG s3 GGGGGGGGGGGGGG s4 TTTTTTTTTTTTTT s4 TTTTTTTTTTTTTT + Example 2 (swap): + ``` + s1 CCCCCCCCCCCCCC s1 CCCCCCCCCCCCCC + s2 AAAAAAAAAAAAAA => s2 AAAATTTTTTTAAA + s3 GGGGGGGGGGGGGG s3 GGGGGGGGGGGGGG + s4 TTTTTTTTTTTTTT s4 TTTTAAAAAAATTT ``` * `goalign shuffle rogue`: Simulate rogue taxa, by shuffling (horizontally) a given proportion of the sites of a given proportion of the sequences. Example: