diff --git a/README.md b/README.md
index 2a20ffe..87767ab 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
 
 ## MEGABYTE - Pytorch
 
-Implementation of MEGABYTE, Predicting Million-byte Sequences with Multiscale Transformers, in Pytorch
+Implementation of MEGABYTE, Predicting Million-byte Sequences with Multiscale Transformers, in Pytorch. Took the liberty to generalize it even further so one can have multiple local models.
 
-Similar independent research
+Similar independent research that is a further generalization
 
 ## Appreciation
 
@@ -25,8 +25,8 @@ from MEGABYTE_pytorch import MEGABYTE
 model = MEGABYTE(
     num_tokens = 16000,             # number of tokens
     dim = 512,                      # transformer model dimension
-    max_seq_len = (1024, 4),        # sequence length for global and then local
-    depth = (6, 4),                 # number of layers for global and then local
+    max_seq_len = (1024, 4),        # sequence length for global and then local. this can be more than 2
+    depth = (6, 4),                 # number of layers for global and then local. this can be more than 2, but length must match the max_seq_len's
     dim_head = 64,                  # dimension per head
     heads = 8,                      # number of attention heads
     flash_attn = True               # use flash attention
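
Since the added comments state that `max_seq_len` and `depth` may hold more than two entries (one global stage followed by multiple local stages), a minimal sketch of a three-stage configuration could look like the following. The stage sizes and depths chosen here are illustrative assumptions, not values from the paper or this commit; only the constructor arguments shown in the diff are used.

```python
from MEGABYTE_pytorch import MEGABYTE

# hypothetical three-stage setup: one global model and two local models.
# the tuple lengths of max_seq_len and depth must match.
model = MEGABYTE(
    num_tokens = 16000,            # number of tokens
    dim = 512,                     # transformer model dimension
    max_seq_len = (512, 8, 4),     # sequence lengths, coarsest (global) stage first -- illustrative values
    depth = (6, 4, 2),             # one depth per stage, same length as max_seq_len -- illustrative values
    dim_head = 64,                 # dimension per head
    heads = 8,                     # number of attention heads
    flash_attn = True              # use flash attention
)
```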