
grok-1 support. (#18)
b4rtaz authored Apr 11, 2024
1 parent 49e18b3 commit 620644a
Showing 33 changed files with 1,829 additions and 1,087 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -7,6 +7,7 @@
__pycache__

quants-test
transformer-tasks-test
llama2-tasks-test
grok1-tasks-test
main
run.sh
18 changes: 12 additions & 6 deletions Makefile
@@ -11,14 +11,20 @@ socket: src/socket.cpp
	$(CXX) $(CXXFLAGS) -c src/socket.cpp -o socket.o
transformer: src/utils.cpp
	$(CXX) $(CXXFLAGS) -c src/transformer.cpp -o transformer.o
transformer-tasks: src/transformer-tasks.cpp
	$(CXX) $(CXXFLAGS) -c src/transformer-tasks.cpp -o transformer-tasks.o
tasks: src/tasks.cpp
	$(CXX) $(CXXFLAGS) -c src/tasks.cpp -o tasks.o
llama2-tasks: src/llama2-tasks.cpp
	$(CXX) $(CXXFLAGS) -c src/llama2-tasks.cpp -o llama2-tasks.o
grok1-tasks: src/grok1-tasks.cpp
	$(CXX) $(CXXFLAGS) -c src/grok1-tasks.cpp -o grok1-tasks.o
tokenizer: src/tokenizer.cpp
	$(CXX) $(CXXFLAGS) -c src/tokenizer.cpp -o tokenizer.o

main: src/main.cpp utils quants funcs socket transformer transformer-tasks tokenizer
	$(CXX) $(CXXFLAGS) src/main.cpp -o main utils.o quants.o funcs.o socket.o transformer.o transformer-tasks.o tokenizer.o -lpthread
main: src/main.cpp utils quants funcs socket transformer tasks llama2-tasks grok1-tasks tokenizer
	$(CXX) $(CXXFLAGS) src/main.cpp -o main utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o tokenizer.o -lpthread
quants-test: src/quants.cpp utils quants
	$(CXX) $(CXXFLAGS) src/quants-test.cpp -o quants-test utils.o quants.o -lpthread
transformer-tasks-test: src/transformer-tasks-test.cpp utils quants funcs socket transformer transformer-tasks tokenizer
	$(CXX) $(CXXFLAGS) src/transformer-tasks-test.cpp -o transformer-tasks-test utils.o quants.o funcs.o socket.o transformer.o transformer-tasks.o tokenizer.o -lpthread
llama2-tasks-test: src/llama2-tasks-test.cpp utils quants funcs socket transformer llama2-tasks tokenizer
	$(CXX) $(CXXFLAGS) src/llama2-tasks-test.cpp -o llama2-tasks-test utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o tokenizer.o -lpthread
grok1-tasks-test: src/grok1-tasks-test.cpp utils quants funcs socket transformer llama2-tasks grok1-tasks tokenizer
	$(CXX) $(CXXFLAGS) src/grok1-tasks-test.cpp -o grok1-tasks-test utils.o quants.o funcs.o socket.o transformer.o tasks.o llama2-tasks.o grok1-tasks.o tokenizer.o -lpthread
70 changes: 23 additions & 47 deletions README.md
@@ -8,15 +8,14 @@ Run LLMs on weak devices or make powerful devices even more powerful by distribu

<p align="center">
<img src=".github/8raspi.jpg" width="50%" alt="Distributed Llama running on 8 Raspberry Pi 4B devices" /><br />
<sub><sup>Distributed Llama running on 8 Raspberry Pi 4B devices</sup></sub>
<sub><sup>Distributed Llama running Llama 2 70B on 8 Raspberry Pi 4B devices</sup></sub>
</p>

This project was initiated based on the [llama2.c](https://github.com/karpathy/llama2.c) repository. Big thanks to [@karpathy](https://github.com/karpathy) and other contributors. Most ARM optimizations come from the [llama.cpp](https://github.com/ggerganov/llama.cpp) project.

📃 [Read the report](https://raw.githubusercontent.com/b4rtaz/distributed-llama/main/report/report.pdf)
**Supported models:**
* Llama 2 (7B, 13B, 70B) chat and non-chat versions,
* Grok-1 (314B).

**Known limitations**
* This project is a proof of concept; it's not optimized for production usage.
**Known limitations:**
* You can run Distributed Llama only on 1, 2, 4... 2^n devices.
* Optimized for (weights format × buffer format):
* ARM CPUs
@@ -30,10 +29,6 @@ This project was initiated based on the [llama2.c](https://github.com/karpathy/l
* ❌ Q40 × F32
* ⚠️ Q40 × Q80 (partial optimization)

**Supported models**
* Llama 2 (7B, 13B, 70B) chat and non-chat versions,
* Llama 2 compatible models

**Architecture**<br />
The project is split up into two parts:
* **Root node** - it's responsible for loading the model and weights and forwarding them to workers. It also synchronizes the state of the neural network. The root node is also a worker; it processes its own slice of the neural network.
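
As an illustrative sketch only (Python in one process, not the project's C++ socket implementation), the idea is that every node, root included, holds a slice of the weights and computes a partial result that the root combines. Dimensions and the in-process "nodes" below are made up for illustration:

```python
import torch

# Illustrative only: a matmul split across 2^n "nodes" in a single process.
# In Distributed Llama the slices live on separate devices and partial results
# are exchanged over sockets; this just shows why the outputs combine exactly.
n_nodes = 4                                   # must be a power of two (see limitations above)
dim, hidden_dim = 8, 16
weight = torch.randn(hidden_dim, dim)
x = torch.randn(dim)

slices = torch.chunk(weight, n_nodes, dim=0)  # each node owns a horizontal slice of the weights
partials = [w @ x for w in slices]            # computed independently, one per node
y = torch.cat(partials)                       # the root gathers and concatenates the partial outputs

assert torch.allclose(y, weight @ x)
```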
@@ -106,7 +101,7 @@ All tests below were conducted on c3d-highcpu-30 (30 vCPU, 15 core, 59 GB memory

<sub><sup>S - sent data from the root node to workers, R - received data by the root node from workers</sup></sub>

## 🔨 How to Convert Llama 2 Weights
## 🔨 Download & Convert Llama 2

1. Download [Llama 2](https://github.com/facebookresearch/llama) weights from Meta. This project supports 7B, 7B-chat, 13B, 13B-chat, 70B and 70B-chat models.
2. Open the `llama-2-7b/params.json` file and replace `"vocab_size": -1` with `"vocab_size": 32000` (an optional helper for this edit is sketched after this list).
@@ -118,6 +113,10 @@ cd converter && pip install -r requirements.txt
```sh
python convert-llama2.py /path/to/meta/llama-2-7b q40
```
5. Download the `tokenizer.bin` file from the [llama2.c](https://github.com/karpathy/llama2.c) repository.
```
wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
```
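
For step 2 above, here is a minimal, hypothetical helper that performs the same edit programmatically. The path and the `32000` value come from the instructions; the helper itself is only a convenience wrapper and not part of the converter.

```python
import json

# Hypothetical convenience helper for step 2: set vocab_size in params.json.
def patch_vocab_size(params_path: str, vocab_size: int = 32000) -> None:
    with open(params_path) as f:
        params = json.load(f)
    params['vocab_size'] = vocab_size        # replaces the placeholder -1
    with open(params_path, 'w') as f:
        json.dump(params, f, indent=2)

patch_vocab_size('/path/to/meta/llama-2-7b/params.json')
```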

In the table below, you can find the expected size of the converted weights with different floating-point types.

@@ -127,23 +126,14 @@ In the table below, you can find the expected size of the converted weights with
| Llama 2 13B | 26.03 GB | | | 7.35 GB |
| Llama 2 70B | 137.97 GB | | | 36.98 GB |
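
For context on the ratios above: assuming a llama.cpp-style Q4_0 block layout (32 weights packed into 16 bytes of 4-bit values plus one fp16 scale, which is an assumption about this project's Q40 format), the per-weight cost works out to 4.5 bits, roughly 3.5x smaller than the 16-bit originals.

```python
# Back-of-envelope Q40 cost, assuming a llama.cpp-style Q4_0 block layout.
BLOCK_WEIGHTS = 32
BLOCK_BYTES = 16 + 2                                  # 32 packed 4-bit values + one fp16 scale

bits_per_weight = BLOCK_BYTES * 8 / BLOCK_WEIGHTS     # 4.5 bits
compression_vs_f16 = 16 / bits_per_weight             # ~3.6x

print(f'{bits_per_weight} bits/weight, ~{compression_vs_f16:.1f}x smaller than F16 weights')
```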

## 🔨 How to Convert .bin Weights

You can convert weights compatible with [llama2.c](https://github.com/karpathy/llama2.c) to the Distributed Llama format. The legacy converter converts weights only to Float32 format.
## 🔨 Download Grok-1 Weights

1. Download weights.
```
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories42M.bin
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.bin
```
2. Install dependencies of the converter:
```sh
cd converter && pip install -r requirements.txt
1. Download quantized (Q40) weights from https://huggingface.co/b4rtaz/grok-1-dllama (180GB).
2. Merge the split model files into a single file (a Python alternative is sketched after this list):
```
3. Convert weights to Distributed Llama format.
```sh
python convert-legacy.py stories42M.bin true
cat dllama-grok-1-q40.binaa dllama-grok-1-q40.binab dllama-grok-1-q40.binac dllama-grok-1-q40.binad dllama-grok-1-q40.binae dllama-grok-1-q40.binaf dllama-grok-1-q40.binag dllama-grok-1-q40.binah dllama-grok-1-q40.binai > dllama-grok-1-q40-final.bin
```
3. The tokenizer file is already added to this repository: `tokenizers/grok-1-tokenizer.t`.
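
The snippet below is a hypothetical Python alternative to the `cat` command in step 2, for cases where concatenating very large files via the shell is inconvenient. It assumes the part files follow the `.binaa`...`.binai` naming shown above and simply concatenates them in lexicographic order.

```python
import glob

# Hypothetical alternative to the `cat` command above: concatenate the split
# Grok-1 parts (dllama-grok-1-q40.binaa ... .binai) in lexicographic order.
def merge_parts(pattern: str, output_path: str, chunk_size: int = 1 << 24) -> None:
    parts = sorted(glob.glob(pattern))
    if not parts:
        raise FileNotFoundError(f'No files match {pattern}')
    with open(output_path, 'wb') as out:
        for part in parts:
            with open(part, 'rb') as f:
                while chunk := f.read(chunk_size):
                    out.write(chunk)

merge_parts('dllama-grok-1-q40.bina?', 'dllama-grok-1-q40-final.bin')
```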

## 📟 How to Run on Raspberry Pi Devices

@@ -166,21 +156,17 @@ git clone https://github.com/b4rtaz/distributed-llama.git
```sh
make main
```
7. Download the `tokenizer.bin` file from the [llama2.c](https://github.com/karpathy/llama2.c) repository to the root device.
```
wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
```
8. Transfer converted weights to the root device.
9. Optional: assign static IP addresses.
7. Transfer weights and the tokenizer file to the root device.
8. Optional: assign static IP addresses.
```sh
sudo ip addr add 10.0.0.1/24 dev eth0 # 1st device
sudo ip addr add 10.0.0.2/24 dev eth0 # 2nd device
```
10. Run worker nodes on worker devices:
9. Run worker nodes on worker devices:
```sh
sudo nice -n -20 ./main worker --port 9998 --nthreads 4
```
11. Run root node on the root device:
10. Run root node on the root device:
```sh
sudo nice -n -20 ./main inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../tokenizer.bin --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 10.0.0.2:9998
```
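
As a small, hypothetical convenience (not part of the project), the sketch below assembles the root-node command line from a worker list, using only flags that appear in the examples above. It assumes several `address:port` pairs can follow `--workers`, as the plural flag name suggests.

```python
# Hypothetical helper: build the root-node command from the flags used above.
def root_command(model: str, tokenizer: str, workers: list[str],
                 prompt: str = 'Hello world', steps: int = 16, nthreads: int = 4) -> str:
    cmd = ['sudo', 'nice', '-n', '-20', './main', 'inference',
           '--model', model, '--tokenizer', tokenizer,
           '--weights-float-type', 'q40', '--buffer-float-type', 'q80',
           '--prompt', f'"{prompt}"', '--steps', str(steps), '--nthreads', str(nthreads)]
    if workers:
        cmd += ['--workers'] + workers   # assumption: multiple pairs separated by spaces
    return ' '.join(cmd)

print(root_command('../dllama_llama-2-7b_q40.bin', '../tokenizer.bin', ['10.0.0.2:9998']))
```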
@@ -209,26 +195,16 @@ git clone https://github.com/b4rtaz/distributed-llama.git
```sh
make main
```
4. Download the `tokenizer.bin` file from the [llama2.c](https://github.com/karpathy/llama2.c) repository.
```sh
wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
```
5. Download converted weights from your Google Drive. To get the file ID you need to share the file ("Anyone with the link") and copy the ID from the URL.
```sh
sudo apt install python pip
pip install gdown
gdown https://drive.google.com/uc?id=<FILE_ID>
```
6. Run worker nodes on worker devices:
4. Transfer weights and the tokenizer file to the root node.
5. Run worker nodes on worker devices:
```sh
sudo nice -n -20 ./main worker --port 9998 --nthreads 4
```
7. Run worker nodes on worker devices:
6. Run root node on the root device:
```sh
sudo nice -n -20 ./main inference --model ../dllama_llama-2-7b_q40.bin --tokenizer ../tokenizer.bin --weights-float-type q40 --buffer-float-type q80 --prompt "Hello world" --steps 16 --nthreads 4 --workers 192.168.0.1:9998
```

8. To run the root node in the chat mode:
7. To run the root node in the chat mode:
```sh
sudo nice -n -20 ./main chat --model ../dllama_llama-2-7b-chat_q40.bin --tokenizer ../tokenizer.bin --weights-float-type q40 --buffer-float-type q80 --nthreads 4 --workers 192.168.0.1:9998
```
126 changes: 126 additions & 0 deletions converter/convert-grok-1.py
@@ -0,0 +1,126 @@
import gc
import torch
import sys
import os
from writer import isFloatTypeSupported, writeTensor, writeHeader

# Model: https://huggingface.co/keyfan/grok-1-hf/tree/main

currentFileIndex = None
model = None
layerMap = {}
folderPath = None

def unloadModel():
    global model
    if model:
        del model
        model = None
        gc.collect()

def loadModel(index):
    global currentFileIndex
    global model
    global layerMap
    global folderPath
    if (currentFileIndex == index):
        return
    unloadModel()
    fileName = f'pytorch_model-000{str(index).zfill(2)}-of-00019.bin'
    filePath = os.path.join(folderPath, fileName)
    print(f'💿 Loading file {fileName}...')
    model = torch.load(filePath, map_location='cpu')
    layerNames = list(model.keys())
    for layerName in layerNames:
        layerMap[layerName] = index
    print(f'Found layers: {layerNames}')
    currentFileIndex = index

def writeLayer(outFile, layerName, targetFloatType):
    global currentFileIndex
    global model
    global layerMap

    if (not layerName in model):
        if (layerName in layerMap):
            loadModel(layerMap[layerName])
        else:
            loadModel(currentFileIndex + 1)
        if (not layerName in model):
            raise Exception(f'Cannot load {layerName}')

    tensor = model[layerName]
    print(f'🔶 Writing tensor {layerName} {tensor.shape}...')
    writeTensor(outFile, tensor, targetFloatType)

def convert(targetFloatType, outputFileName):
    outFile = open(outputFileName, 'wb')

    params = {
        'arch_type': 0xABCD01,
        'dim': 6144,
        'hidden_dim': 32768,
        'n_layers': 64,
        'n_heads': 48,
        'n_kv_heads': 8,
        'n_experts': 8,
        'n_active_experts': 2,
        'vocab_size': 131072,
        'max_seq_len': 8192,
    }
    writeHeader(outFile, params)

    #### pytorch_model-00001-of-00019.bin -> pytorch_model-00019-of-00019.bin
    loadModel(1)

    writeLayer(outFile, 'transformer.in_out_embed.weight', 'f32')

    for index in range(0, params['n_layers']):
        writeLayer(outFile, f'transformer.decoder_layer.{index}.multi_head_attention.query.weight', targetFloatType)
        writeLayer(outFile, f'transformer.decoder_layer.{index}.multi_head_attention.key.weight', targetFloatType)
        writeLayer(outFile, f'transformer.decoder_layer.{index}.multi_head_attention.value.weight', targetFloatType)
        writeLayer(outFile, f'transformer.decoder_layer.{index}.multi_head_attention.linear.weight', targetFloatType)

        writeLayer(outFile, f'transformer.decoder_layer.{index}.router.weight', targetFloatType)
        for e in range(params['n_experts']):
            writeLayer(outFile, f'transformer.decoder_layer.{index}.moe.{e}.linear_v.weight', targetFloatType) # up
            writeLayer(outFile, f'transformer.decoder_layer.{index}.moe.{e}.linear.weight', targetFloatType) # gate
            writeLayer(outFile, f'transformer.decoder_layer.{index}.moe.{e}.linear_1.weight', targetFloatType) # down

        writeLayer(outFile, f'transformer.decoder_layer.{index}.rms_norm.weight', 'f32')
        writeLayer(outFile, f'transformer.decoder_layer.{index}.rms_norm_1.weight', 'f32')
        writeLayer(outFile, f'transformer.decoder_layer.{index}.rms_norm_2.weight', 'f32')
        writeLayer(outFile, f'transformer.decoder_layer.{index}.rms_norm_3.weight', 'f32')

    #### pytorch_model-00019-of-00019.bin
    loadModel(19)

    writeLayer(outFile, 'transformer.rms_norm.weight', 'f32') # rmsFinalNorm

    #### pytorch_model-00001-of-00019.bin
    loadModel(1)

    writeLayer(outFile, 'lm_head.weight', targetFloatType)

    unloadModel()

    outFile.close()
    print(f'Converted {outputFileName}')

def usage():
    print('Usage: python convert-grok-1.py <modelPath> <targetFloatType>')
    exit(1)

if __name__ == '__main__':
    if (len(sys.argv) < 3):
        usage()

    folderPath = sys.argv[1]
    targetFloatType = sys.argv[2]
    outputFileName = f'dllama-grok-1-{targetFloatType}.bin'

    if not isFloatTypeSupported(targetFloatType):
        print('Float type is not supported')
        exit(1)

    convert(targetFloatType, outputFileName)
83 changes: 0 additions & 83 deletions converter/convert-legacy.py

This file was deleted.
