diff --git a/README.md b/README.md
index 2a20ffe..87767ab 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
## MEGABYTE - Pytorch
-Implementation of MEGABYTE, Predicting Million-byte Sequences with Multiscale Transformers, in Pytorch
+Implementation of MEGABYTE, Predicting Million-byte Sequences with Multiscale Transformers, in Pytorch. I took the liberty of generalizing it even further, so one can have multiple local models.
-Similar independent research
+Similar independent research that generalizes the approach even further
## Appreciation
@@ -25,8 +25,8 @@ from MEGABYTE_pytorch import MEGABYTE
model = MEGABYTE(
num_tokens = 16000, # number of tokens
dim = 512, # transformer model dimension
- max_seq_len = (1024, 4), # sequence length for global and then local
- depth = (6, 4), # number of layers for global and then local
+ max_seq_len = (1024, 4), # sequence length for global and then local. the tuple can have more than 2 entries (one per stage)
+ depth = (6, 4), # number of layers for global and then local. the tuple can have more than 2 entries, but its length must match that of max_seq_len
dim_head = 64, # dimension per head
heads = 8, # number of attention heads
flash_attn = True # use flash attention
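
To make the new multi-stage comments concrete, here is a minimal sketch of a three-stage configuration (one global model followed by two local models). It assumes the constructor arguments shown in the hunk above, that a scalar `dim` is shared across stages, and that the training input is shaped to match the per-stage `max_seq_len` entries; treat it as an illustration rather than the repository's canonical example.

```python
import torch
from MEGABYTE_pytorch import MEGABYTE

# hypothetical three-stage setup: max_seq_len and depth now have 3 entries each
model = MEGABYTE(
    num_tokens = 16000,
    dim = 512,
    max_seq_len = (1024, 4, 4),   # one entry per stage, coarsest (global) first
    depth = (6, 4, 2),            # must be the same length as max_seq_len
    dim_head = 64,
    heads = 8,
    flash_attn = True
)

# assumed input shape: batch dimension followed by one axis per max_seq_len entry
x = torch.randint(0, 16000, (1, 1024, 4, 4))

loss = model(x, return_loss = True)
loss.backward()
```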