Skip to content

Commit

Permalink
Merge pull request #2 from ErillLab/develop
Browse files Browse the repository at this point in the history
First version for main branch
  • Loading branch information
eliamascolo authored Aug 1, 2024
2 parents e87f22f + 3d8bc82 commit 26b4402
Show file tree
Hide file tree
Showing 9 changed files with 307 additions and 1 deletion.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.vs_code
build
**/__pycache__
sandbox
**/Markov_DNA.egg-info
78 changes: 77 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,78 @@
# Markov_DNA_gen
A Markov Model DNA sequence generator to generate pseudo-replicate sequences based on an input sequence

A Markov Model DNA sequence generator to generate pseudo-replicate sequences based on an input sequence.

## Requirements

- NumPy

## Installation

It can be installed using pip:

``` bash
pip3 install .
```

Or using conda:

``` bash
conda install erikblazfer::markov_dna
```

## User guide

Import module:

```python
from Markov_DNA import MCM
```

Train model:

```python
mcm = MCM(k)
mcm.train(seq)
```

where:
- k is the model order
- seq is the reference sequence to train the model

Generate sequences:

Can be done in two different ways, using a generator in an iterative way:

```python
for new_seq in mcm.generator(size=l, N=N):
print(new_seq)
```

Or by calling a function that generates a list of sequences:

```python
seqs = mcm.generate(size=l, N=N)
```

Where:
- l is the length of the sequence to be generated.
- N is the number of sequences to be generated.

The advantage of the first method is that you do not need to keep all the sequences in memory, while the second one allows you to obtain a list of sequences directly.

## Example

There are some examples located in the examples folder. To execute them, the following command can be used:

```bash
python3 examples/exampleX.py
```

Where X can be [1..2] depending on the genereting method.

- Example 1: Shows how to use the function that returns the list of sequences.
- Example 2: Shows how to use the iterative form of the generator generator.

## Authors

Erik Blázquez Fernández ([email protected])
13 changes: 13 additions & 0 deletions examples/example1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from Markov_DNA import MCM

if __name__ == "__main__":
seq = "TGAGGACTTTaggatAGGATTTTGTCATCATCAAAAGACATTCTTGTAAATTATATGAAACGTCGGTTTCCTTGTCAAGATTCGTTTTTTGTAAATGATCTCCTTAGTCTATTTTACAACAACATCACTGTGGATAACATCTATCCACACACAAAAGCATGTCCTCAAACTTATCGACAGATTACACACATGATCTTGTGTTGTGGATAAAAAACAAACACAGCATATTGAACCTTGTGGATAAATAGCTAGAACCCTTGCAACTACTTATTTTTTTTGCTAAGATATTATTGTTTTACACGTGGATGCCTATTTTGAACGTCAATTATTATCCACACGCTGTGTATAACTTTGTGGACAGTTGTTTACGACCTTATCCACACAGGTGTGATTCGTGTTCATTCGACGTTTTTCTACTATATTATATTCTATCCACATCGTTTTTCAGAATTTCATATTTATTCATC"

mcm = MCM(8)

mcm.train(seq)

seqs = mcm.generate(size=20, N=10)

for s in seqs:
print(s)
11 changes: 11 additions & 0 deletions examples/example2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from Markov_DNA import MCM

if __name__ == "__main__":
seq = "TGAGGACTTTaggatAGGATTTTGTCATCATCAAAAGACATTCTTGTAAATTATATGAAACGTCGGTTTCCTTGTCAAGATTCGTTTTTTGTAAATGATCTCCTTAGTCTATTTTACAACAACATCACTGTGGATAACATCTATCCACACACAAAAGCATGTCCTCAAACTTATCGACAGATTACACACATGATCTTGTGTTGTGGATAAAAAACAAACACAGCATATTGAACCTTGTGGATAAATAGCTAGAACCCTTGCAACTACTTATTTTTTTTGCTAAGATATTATTGTTTTACACGTGGATGCCTATTTTGAACGTCAATTATTATCCACACGCTGTGTATAACTTTGTGGACAGTTGTTTACGACCTTATCCACACAGGTGTGATTCGTGTTCATTCGACGTTTTTCTACTATATTATATTCTATCCACATCGTTTTTCAGAATTTCATATTTATTCATC"

mcm = MCM(3)

mcm.train(seq)

for new_seq in mcm.generator(N=10):
print(new_seq)
24 changes: 24 additions & 0 deletions meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package:
name: markov_dna
version: "1.0.0"

source:
path: ./

build:
noarch: python
script: python setup.py install
# script: python setup_conda.py install

requirements:
host:
- python
- setuptools
- numpy
run:
- python
- numpy

about:
license: GNU GENERAL PUBLIC LICENSE
summary: "A Markov Model DNA sequence generator to generate pseudo-replicate sequences based on an input sequence."
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
numpy
19 changes: 19 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from setuptools import setup, find_packages

setup(
name='Markov_DNA',
version='1.0.0',
packages=["Markov_DNA"],
package_dir={'':'src'},
description='A Markov Model DNA sequence generator to generate pseudo-replicate sequences based on an input sequence.',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
author='Erik Blázquez',
author_email='[email protected]',
classifiers=[
'Programming Language :: Python :: 3',
'License :: OSI Approved :: GNU GENERAL PUBLIC LICENSE',
'Operating System :: OS Independent',
],
python_requires='>=3.6',
)
156 changes: 156 additions & 0 deletions src/Markov_DNA/MCM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import random
import numpy as np

class MCM:

def __init__(self, k=1):
"""
Markov Chain Model constructor.
Args:
k (int): Order of the Markov model.
"""
# Markov model order
self.k = k

# Transition matrix
# key: Represents the state, a sequence of n nucleotides
# value: Transition probabilities for the state in form of dict.
# Each entry represents the probability to make a transition
# to a nucleotide, being the nucleotide the key
self.transition = {}

# Transition matrix of auxiliar Markov model
# Similar structure as self.transition
self.aux_transition = [{} for _ in range(self.k)]

def train(self, seq):
"""
Trains the model using a DNA sequence, computing the transition matrix of the model.
An auxiliar Markov model is trained, with an order in which there is no state without
transitions. In case of arriving to some state withput transitions, this auxiliar model
is used to generate the next transition.
Args:
seq (string): DNA sequence to train the model.
"""
self.size = len(seq)
seq = seq.upper()
self.seq = seq
self.transition = {}
for i in range(0, len(seq)-self.k):
kmer = seq[i: i+self.k]
if not kmer in self.transition:
self.transition[kmer] = {}
self.transition[kmer]["A"] = 0
self.transition[kmer]["C"] = 0
self.transition[kmer]["G"] = 0
self.transition[kmer]["T"] = 0
self.transition[kmer]["tot"] = 0
self.transition[kmer][seq[i+self.k]] += 1
self.transition[kmer]["tot"] += 1

## Computes all posible models
for k in range(0, self.k):
if k == 0:
aux_kmer = ""
else:
aux_kmer = kmer[-k:]
if not aux_kmer in self.aux_transition[k]:
self.aux_transition[k][aux_kmer] = {}
self.aux_transition[k][aux_kmer]["A"] = 0
self.aux_transition[k][aux_kmer]["C"] = 0
self.aux_transition[k][aux_kmer]["G"] = 0
self.aux_transition[k][aux_kmer]["T"] = 0
self.aux_transition[k][aux_kmer]["tot"] = 0
self.aux_transition[k][aux_kmer][seq[i+self.k]] += 1
self.aux_transition[k][aux_kmer]["tot"] += 1

for key in self.transition.keys():
self.transition[key]["A"] /= self.transition[key]["tot"]
self.transition[key]["C"] /= self.transition[key]["tot"]
self.transition[key]["G"] /= self.transition[key]["tot"]
self.transition[key]["T"] /= self.transition[key]["tot"]

## Searches for the model with the highest order that has no states without transitions
for k in range(self.k-1, -1, -1):
if len(self.aux_transition[k]) == 4**(k):
self.aux_transition = self.aux_transition[k]
for key in self.aux_transition.keys():
self.aux_transition[key]["A"] /= self.aux_transition[key]["tot"]
self.aux_transition[key]["C"] /= self.aux_transition[key]["tot"]
self.aux_transition[key]["G"] /= self.aux_transition[key]["tot"]
self.aux_transition[key]["T"] /= self.aux_transition[key]["tot"]
self.aux_k = k
break

def sample(self, state):
"""
Samples a new nucleotide given an array of probabilities.
Args:
state: Dictionary of probabilities for nucleotides.
"""
if state in self.transition.keys():
probs = self.transition[state]
else:
probs = self.aux_transition[state[-self.aux_k:]]
return np.random.choice(["A","C","G","T"], p=[probs["A"], probs["C"], probs["G"], probs["T"]])

def generate(self, size=None, N=1):
"""
Generates a new sequence of nucleotides using the trained model.
Args:
size (int): Length of the sequence to be generated.
N (int): Number of sequences to be generated.
"""
if size == None:
size = self.size

res = []
for _ in range(N):
if self.k == 0:
probs = self.transition[""]
seq = np.random.choice(["A","C","G","T"], p=[probs["A"], probs["C"], probs["G"], probs["T"]], size=n)
seq = "".join(seq)
else:
seq = self.initial_state()
state = seq
for _ in range(size-len(state)):
nuc = self.sample(state)
seq += nuc
state = seq[-self.k:]
res.append(seq)
return res

def generator(self, size=None, N=None):
"""
Generator for sampling new sequences of nucleotides using the trained model.
Args:
size (int): Length of the sequence to be generated.
N (int): Number of sequences to be generated.
"""
if size == None:
size = self.size

if N == None:
while 1:
seq = self.generate(size=size)[0]
yield seq
else:
for _ in range(N):
seq = self.generate(size=size)[0]
yield seq

def initial_state(self):
"""
Computes the initial k-mer of the sequence sampling from the list
of appeared k-mers in the training sequence.
"""
keys = list(self.transition.keys())
return random.choice(keys)
1 change: 1 addition & 0 deletions src/Markov_DNA/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from Markov_DNA.MCM import MCM

0 comments on commit 26b4402

Please sign in to comment.