Merge pull request #2 from ErillLab/develop

First version for main branch
ErillLab · Aug 1, 2024 · 26b4402 · 26b4402
2 parents e87f22f + 3d8bc82
commit 26b4402
Show file tree

Hide file tree

Showing 9 changed files with 307 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+.vs_code
+build
+**/__pycache__
+sandbox
+**/Markov_DNA.egg-info
diff --git a/README.md b/README.md
@@ -1,2 +1,78 @@
 # Markov_DNA_gen
-A Markov Model DNA sequence generator to generate pseudo-replicate sequences based on an input sequence
+
+A Markov Model DNA sequence generator to generate pseudo-replicate sequences based on an input sequence.
+
+## Requirements
+
+- NumPy
+
+## Installation
+
+It can be installed using pip:
+
+``` bash
+pip3 install .
+```
+
+Or using conda:
+
+``` bash
+conda install erikblazfer::markov_dna
+```
+
+## User guide
+
+Import module:
+
+```python
+    from Markov_DNA import MCM
+```
+
+Train model:
+
+```python
+    mcm = MCM(k)
+    mcm.train(seq)
+```
+
+where:
+  - k is the model order
+  - seq is the reference sequence to train the model
+
+Generate sequences:
+
+Can be done in two different ways, using a generator in an iterative way:
+
+```python
+    for new_seq in mcm.generator(size=l, N=N):
+        print(new_seq)
+```
+
+Or by calling a function that generates a list of sequences:
+
+```python
+    seqs = mcm.generate(size=l, N=N)
+```
+
+Where:
+  - l is the length of the sequence to be generated.
+  - N is the number of sequences to be generated.
+
+The advantage of the first method is that you do not need to keep all the sequences in memory, while the second one allows you to obtain a list of sequences directly.
+
+## Example
+
+There are some examples located in the examples folder. To execute them, the following command can be used:
+
+```bash
+    python3 examples/exampleX.py
+```
+
+Where X can be [1..2] depending on the genereting method.
+
+- Example 1: Shows how to use the function that returns the list of sequences.
+- Example 2: Shows how to use the iterative form of the generator generator.
+
+## Authors
+
+Erik Blázquez Fernández ([email protected])
diff --git a/examples/example1.py b/examples/example1.py
@@ -0,0 +1,13 @@
+from Markov_DNA import MCM
+
+if __name__ == "__main__":
+    seq = "TGAGGACTTTaggatAGGATTTTGTCATCATCAAAAGACATTCTTGTAAATTATATGAAACGTCGGTTTCCTTGTCAAGATTCGTTTTTTGTAAATGATCTCCTTAGTCTATTTTACAACAACATCACTGTGGATAACATCTATCCACACACAAAAGCATGTCCTCAAACTTATCGACAGATTACACACATGATCTTGTGTTGTGGATAAAAAACAAACACAGCATATTGAACCTTGTGGATAAATAGCTAGAACCCTTGCAACTACTTATTTTTTTTGCTAAGATATTATTGTTTTACACGTGGATGCCTATTTTGAACGTCAATTATTATCCACACGCTGTGTATAACTTTGTGGACAGTTGTTTACGACCTTATCCACACAGGTGTGATTCGTGTTCATTCGACGTTTTTCTACTATATTATATTCTATCCACATCGTTTTTCAGAATTTCATATTTATTCATC"
+
+    mcm = MCM(8)
+
+    mcm.train(seq)
+
+    seqs = mcm.generate(size=20, N=10)
+
+    for s in seqs:
+        print(s)
diff --git a/examples/example2.py b/examples/example2.py
@@ -0,0 +1,11 @@
+from Markov_DNA import MCM
+
+if __name__ == "__main__":
+    seq = "TGAGGACTTTaggatAGGATTTTGTCATCATCAAAAGACATTCTTGTAAATTATATGAAACGTCGGTTTCCTTGTCAAGATTCGTTTTTTGTAAATGATCTCCTTAGTCTATTTTACAACAACATCACTGTGGATAACATCTATCCACACACAAAAGCATGTCCTCAAACTTATCGACAGATTACACACATGATCTTGTGTTGTGGATAAAAAACAAACACAGCATATTGAACCTTGTGGATAAATAGCTAGAACCCTTGCAACTACTTATTTTTTTTGCTAAGATATTATTGTTTTACACGTGGATGCCTATTTTGAACGTCAATTATTATCCACACGCTGTGTATAACTTTGTGGACAGTTGTTTACGACCTTATCCACACAGGTGTGATTCGTGTTCATTCGACGTTTTTCTACTATATTATATTCTATCCACATCGTTTTTCAGAATTTCATATTTATTCATC"
+
+    mcm = MCM(3)
+
+    mcm.train(seq)
+
+    for new_seq in mcm.generator(N=10):
+        print(new_seq)
diff --git a/meta.yaml b/meta.yaml
@@ -0,0 +1,24 @@
+package:
+  name: markov_dna
+  version: "1.0.0"
+
+source:
+  path: ./
+
+build:
+  noarch: python
+  script: python setup.py install
+  # script: python setup_conda.py install
+
+requirements:
+  host:
+    - python
+    - setuptools
+    - numpy
+  run:
+    - python
+    - numpy
+
+about:
+  license: GNU GENERAL PUBLIC LICENSE
+  summary: "A Markov Model DNA sequence generator to generate pseudo-replicate sequences based on an input sequence."
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1 @@
+numpy
diff --git a/setup.py b/setup.py
@@ -0,0 +1,19 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='Markov_DNA',
+    version='1.0.0',
+    packages=["Markov_DNA"],
+    package_dir={'':'src'},
+    description='A Markov Model DNA sequence generator to generate pseudo-replicate sequences based on an input sequence.',
+    long_description=open('README.md').read(),
+    long_description_content_type='text/markdown',
+    author='Erik Blázquez',
+    author_email='[email protected]',
+    classifiers=[
+        'Programming Language :: Python :: 3',
+        'License :: OSI Approved :: GNU GENERAL PUBLIC LICENSE',
+        'Operating System :: OS Independent',
+    ],
+    python_requires='>=3.6',
+)
diff --git a/src/Markov_DNA/MCM.py b/src/Markov_DNA/MCM.py
@@ -0,0 +1,156 @@
+import random
+import numpy as np
+
+class MCM:
+
+    def __init__(self, k=1):
+        """
+        Markov Chain Model constructor.
+
+        Args:
+            k (int): Order of the Markov model.
+        """
+        # Markov model order
+        self.k = k
+
+        # Transition matrix
+        # key: Represents the state, a sequence of n nucleotides
+        # value: Transition probabilities for the state in form of dict.
+        #         Each entry represents the probability to make a transition
+        #         to a nucleotide, being the nucleotide the key
+        self.transition = {}
+
+        # Transition matrix of auxiliar Markov model
+        # Similar structure as self.transition
+        self.aux_transition = [{} for _ in range(self.k)]
+
+    def train(self, seq):
+        """
+        Trains the model using a DNA sequence, computing the transition matrix of the model.
+
+        An auxiliar Markov model is trained, with an order in which there is no state without 
+        transitions. In case of arriving to some state withput transitions, this auxiliar model
+        is used to generate the next transition.
+
+        Args:
+            seq (string): DNA sequence to train the model.
+        """
+        self.size = len(seq)
+        seq = seq.upper()
+        self.seq = seq
+        self.transition = {}
+        for i in range(0, len(seq)-self.k):
+            kmer = seq[i: i+self.k]
+            if not kmer in self.transition:
+                self.transition[kmer] = {}
+                self.transition[kmer]["A"] = 0
+                self.transition[kmer]["C"] = 0
+                self.transition[kmer]["G"] = 0
+                self.transition[kmer]["T"] = 0
+                self.transition[kmer]["tot"] = 0
+            self.transition[kmer][seq[i+self.k]] += 1
+            self.transition[kmer]["tot"] += 1
+
+            ## Computes all posible models
+            for k in range(0, self.k):
+                if k == 0:
+                    aux_kmer = ""
+                else:
+                    aux_kmer = kmer[-k:]
+                if not aux_kmer in self.aux_transition[k]:
+                    self.aux_transition[k][aux_kmer] = {}
+                    self.aux_transition[k][aux_kmer]["A"] = 0
+                    self.aux_transition[k][aux_kmer]["C"] = 0
+                    self.aux_transition[k][aux_kmer]["G"] = 0
+                    self.aux_transition[k][aux_kmer]["T"] = 0
+                    self.aux_transition[k][aux_kmer]["tot"] = 0
+                self.aux_transition[k][aux_kmer][seq[i+self.k]] += 1
+                self.aux_transition[k][aux_kmer]["tot"] += 1
+
+        for key in self.transition.keys():
+            self.transition[key]["A"] /= self.transition[key]["tot"]
+            self.transition[key]["C"] /= self.transition[key]["tot"]
+            self.transition[key]["G"] /= self.transition[key]["tot"]
+            self.transition[key]["T"] /= self.transition[key]["tot"]
+
+        ## Searches for the model with the highest order that has no states without transitions
+        for k in range(self.k-1, -1, -1):
+            if len(self.aux_transition[k]) == 4**(k):
+                self.aux_transition = self.aux_transition[k]
+                for key in self.aux_transition.keys():
+                    self.aux_transition[key]["A"] /= self.aux_transition[key]["tot"]
+                    self.aux_transition[key]["C"] /= self.aux_transition[key]["tot"]
+                    self.aux_transition[key]["G"] /= self.aux_transition[key]["tot"]
+                    self.aux_transition[key]["T"] /= self.aux_transition[key]["tot"]
+                self.aux_k = k
+                break
+
+    def sample(self, state):
+        """
+        Samples a new nucleotide given an array of probabilities.
+
+        Args:
+            state: Dictionary of probabilities for nucleotides.
+        """
+        if state in self.transition.keys():
+            probs = self.transition[state]
+        else:
+            probs = self.aux_transition[state[-self.aux_k:]]
+        return np.random.choice(["A","C","G","T"], p=[probs["A"], probs["C"], probs["G"], probs["T"]])
+
+    def generate(self, size=None, N=1):
+        """
+        Generates a new sequence of nucleotides using the trained model.
+
+        Args:
+            size (int): Length of the sequence to be generated.
+
+            N (int): Number of sequences to be generated.
+        """
+        if size == None:
+            size = self.size
+
+        res = []
+        for _ in range(N):
+            if self.k == 0:
+                probs = self.transition[""]
+                seq = np.random.choice(["A","C","G","T"], p=[probs["A"], probs["C"], probs["G"], probs["T"]], size=n)
+                seq = "".join(seq)
+            else:
+                seq = self.initial_state()
+                state = seq
+                for _ in range(size-len(state)):
+                    nuc = self.sample(state)
+                    seq += nuc
+                    state = seq[-self.k:]
+            res.append(seq)
+        return res
+
+    def generator(self, size=None, N=None):
+        """
+        Generator for sampling new sequences of nucleotides using the trained model.
+
+        Args:
+            size (int): Length of the sequence to be generated.
+
+            N (int): Number of sequences to be generated.
+        """
+        if size == None:
+            size = self.size
+
+        if N == None:
+            while 1:
+                seq = self.generate(size=size)[0]
+                yield seq
+        else:
+            for _ in range(N):
+                seq = self.generate(size=size)[0]
+                yield seq
+
+    def initial_state(self):
+        """
+        Computes the initial k-mer of the sequence sampling from the list
+        of appeared k-mers in the training sequence.
+        """
+        keys = list(self.transition.keys())
+        return random.choice(keys)
diff --git a/src/Markov_DNA/__init__.py b/src/Markov_DNA/__init__.py
@@ -0,0 +1 @@
+from Markov_DNA.MCM import MCM