feat(transformer-nvidia): copy parameters to page-locked (pinned) memory first
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Feb 26, 2024
1 parent e2407d0 commit 92c50a1
Showing 8 changed files with 188 additions and 8 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -5,6 +5,7 @@ members = [
"model-parameters",
"tokenizer",
"transformer-cpu",
"transformer-nvidia",
"xtask",
]
resolver = "2"
30 changes: 26 additions & 4 deletions model-parameters/src/lib.rs
@@ -8,7 +8,7 @@ use std::{
};
use tensor::{DataType, Tensor};

pub use memory::{Memory, SafeTensorError};
pub use memory::{Allocator, Memory, SafeTensorError};
pub use save::save;

pub trait Llama2 {
@@ -80,6 +80,23 @@ pub trait Llama2 {
fn model_norm(&self) -> Tensor<Storage>;
/// Shape = `vocab_size x hidden_size`.
fn lm_head(&self) -> Tensor<Storage>;

fn tensors(&self) -> Vec<Tensor<Storage>> {
let mut tensors = Vec::with_capacity(self.num_hidden_layers() * 6 + 3);
tensors.push(self.embed_tokens());
for layer in 0..self.num_hidden_layers() {
tensors.push(self.input_layernorm(layer));
tensors.push(self.w_qkv(layer));
tensors.push(self.self_attn_o_proj(layer));
tensors.push(self.post_attention_layernorm(layer));
tensors.push(self.mlp_gate_up(layer));
tensors.push(self.mlp_down(layer));
}
tensors.push(self.model_norm());
tensors.push(self.lm_head());
tensors
}
}

#[derive(serde::Serialize, serde::Deserialize, Debug)]
@@ -119,7 +136,7 @@ impl From<&dyn Llama2> for ConfigJson {

#[derive(Clone)]
pub struct Storage {
data: Arc<dyn AsRef<[u8]>>,
data: Arc<dyn Deref<Target = [u8]>>,
range: Range<usize>,
}

@@ -134,19 +151,24 @@ impl Deref for Storage {

impl Storage {
#[inline]
pub fn new(data: Arc<dyn AsRef<[u8]>>, offset: usize, len: usize) -> Self {
pub fn new(data: Arc<dyn Deref<Target = [u8]>>, offset: usize, len: usize) -> Self {
Self {
data,
range: offset..offset + len,
}
}

#[inline]
pub fn from_blob(data: impl 'static + AsRef<[u8]>) -> Self {
pub fn from_blob(data: impl 'static + Deref<Target = [u8]>) -> Self {
let len = data.as_ref().len();
Self {
data: Arc::new(data),
range: 0..len,
}
}

#[inline]
pub fn raw_blob(&self) -> &[u8] {
&self.data
}
}
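
The change of bound from `AsRef<[u8]>` to `Deref<Target = [u8]>` matters because the owner types introduced below (`TotalStorage`, wrapping the page-locked buffer) expose their bytes through `Deref` rather than `AsRef`. A minimal sketch of the kind of owner the relaxed bound admits (the `PinnedBuf` type here is hypothetical, for illustration only):

use std::ops::Deref;

/// Hypothetical owner of a page-locked host buffer.
struct PinnedBuf {
    ptr: *mut u8,
    len: usize,
}

impl Deref for PinnedBuf {
    type Target = [u8];

    fn deref(&self) -> &[u8] {
        // Safety: `ptr` is assumed to point at `len` initialized bytes
        // for as long as this owner is alive.
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }
}

// `PinnedBuf` never implements `AsRef<[u8]>`, yet it can now back a `Storage`:
// let storage = Storage::from_blob(pinned_buf);
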
5 changes: 3 additions & 2 deletions model-parameters/src/memory/mod.rs
@@ -1,10 +1,12 @@
mod cast;
mod realloc;
mod safe_tensors;

use crate::{ConfigJson, DataType, Llama2, Storage};
use common::utok;
use tensor::{udim, Shape, Tensor};

pub use realloc::Allocator;
pub use safe_tensors::SafeTensorError;
pub(crate) use safe_tensors::SafeTensorHeaderJson;

@@ -210,8 +212,7 @@ fn concat0(tensors: &[&Tensor<Storage>]) -> Tensor<Storage> {

#[test]
fn test_load() {
use std::io::ErrorKind::NotFound;
use std::time::Instant;
use std::{io::ErrorKind::NotFound, time::Instant};

let t0 = Instant::now();
let safetensors = Memory::load_safetensors("../../TinyLlama-1.1B-Chat-v1.0");
77 changes: 77 additions & 0 deletions model-parameters/src/memory/realloc.rs
@@ -0,0 +1,77 @@
use crate::{memory::Layer, ConfigJson, Llama2, Memory, Storage};
use std::{ops::Deref, ptr::NonNull, slice::from_raw_parts_mut, sync::Arc};
use tensor::Tensor;

pub trait Allocator {
unsafe fn allocate(&self, size: usize) -> NonNull<u8>;
unsafe fn deallocate(&self, ptr: NonNull<u8>);
}

struct TotalStorage<A: Allocator> {
ptr: NonNull<u8>,
len: usize,
allocator: A,
}

impl<A: Allocator> Deref for TotalStorage<A> {
type Target = [u8];

fn deref(&self) -> &Self::Target {
unsafe { std::slice::from_raw_parts(self.ptr.as_ptr(), self.len) }
}
}

impl<A: Allocator> Drop for TotalStorage<A> {
fn drop(&mut self) {
unsafe { self.allocator.deallocate(self.ptr) }
}
}

impl Memory {
pub fn realloc_with(src: &dyn Llama2, allocator: impl Allocator + 'static) -> Self {
let len = src.size();
let ptr = unsafe { allocator.allocate(len) };
let total = Arc::new(TotalStorage {
ptr,
len,
allocator,
});

struct Writer<A: Allocator> {
total: Arc<TotalStorage<A>>,
offset: usize,
}
impl<A: Allocator + 'static> Writer<A> {
fn write(&mut self, tensor: Tensor<Storage>) -> Tensor<Storage> {
let offset = self.offset;
let ptr = self.total.ptr.as_ptr();
let len = tensor.bytes_size();
self.offset += len;
unsafe { tensor.reform_to_raw(from_raw_parts_mut(ptr.add(offset), len)) };
Tensor::new(
tensor.data_type(),
tensor.shape(),
Storage::new(self.total.clone(), offset, len),
)
}
}

let mut writer = Writer { total, offset: 0 };
Self {
config: ConfigJson::from(src),
embed_tokens: writer.write(src.embed_tokens()),
layers: (0..src.num_hidden_layers())
.map(|layer| Layer {
input_layernorm: writer.write(src.input_layernorm(layer)),
w_qkv: writer.write(src.w_qkv(layer)),
self_attn_o_proj: writer.write(src.self_attn_o_proj(layer)),
post_attention_layernorm: writer.write(src.post_attention_layernorm(layer)),
mlp_gate_up: writer.write(src.mlp_gate_up(layer)),
mlp_down: writer.write(src.mlp_down(layer)),
})
.collect(),
model_norm: writer.write(src.model_norm()),
lm_head: writer.write(src.lm_head()),
}
}
}
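
For context, a minimal sketch of how the new `Allocator` trait and `Memory::realloc_with` are meant to fit together, using an ordinary heap allocation instead of pinned memory (this `HeapAllocator` is illustrative and not part of the commit; it stashes the size in a small header so `deallocate` can rebuild the `Layout`):

use model_parameters::{Allocator, Memory};
use std::{
    alloc::{alloc, dealloc, Layout},
    ptr::NonNull,
};

struct HeapAllocator;

impl Allocator for HeapAllocator {
    unsafe fn allocate(&self, size: usize) -> NonNull<u8> {
        // Reserve 8 extra bytes in front to remember the size for `deallocate`.
        let layout = Layout::from_size_align(size + 8, 8).unwrap();
        let base = NonNull::new(alloc(layout)).expect("allocation failed");
        base.as_ptr().cast::<usize>().write(size);
        NonNull::new_unchecked(base.as_ptr().add(8))
    }

    unsafe fn deallocate(&self, ptr: NonNull<u8>) {
        let base = ptr.as_ptr().sub(8);
        let size = base.cast::<usize>().read();
        dealloc(base, Layout::from_size_align(size + 8, 8).unwrap());
    }
}

// All parameters of a loaded model end up back to back in one heap block:
// let contiguous = Memory::realloc_with(&src, HeapAllocator);
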
3 changes: 1 addition & 2 deletions transformer-cpu/src/lib.rs
@@ -201,8 +201,7 @@ fn tensor(dt: DataType, shape: &[udim]) -> Tensor<Storage> {
#[test]
fn test_build() {
use model_parameters::SafeTensorError;
use std::io::ErrorKind::NotFound;
use std::time::Instant;
use std::{io::ErrorKind::NotFound, time::Instant};

let t0 = Instant::now();
let safetensors = Memory::load_safetensors("../../TinyLlama-1.1B-Chat-v1.0");
20 changes: 20 additions & 0 deletions transformer-nvidia/Cargo.toml
@@ -0,0 +1,20 @@
[package]
name = "transformer-nvidia"
version = "0.0.0"
edition = "2021"
authors = ["YdrMaster <[email protected]>"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
common = { path = "../common" }
tensor = { path = "../tensor" }
model-parameters = { path = "../model-parameters" }
# cuda = { git = "https://github.com/YdrMaster/cuda-bench" }
cuda = { path = "../../cuda-bench/cuda" }

[dev-dependencies]
tokenizer = { path = "../tokenizer" }

[build-dependencies]
find_cuda_helper = "0.2"
5 changes: 5 additions & 0 deletions transformer-nvidia/build.rs
@@ -0,0 +1,5 @@
fn main() {
if find_cuda_helper::find_cuda_root().is_some() {
println!("cargo:rustc-cfg=detected_cuda");
}
}
55 changes: 55 additions & 0 deletions transformer-nvidia/src/lib.rs
@@ -0,0 +1,55 @@
#![cfg(detected_cuda)]

use cuda::{driver, Context};
use std::{
ptr::{null_mut, NonNull},
sync::Arc,
};

pub extern crate model_parameters;

struct HostAllocator(Arc<Context>);

impl model_parameters::Allocator for HostAllocator {
#[inline]
unsafe fn allocate(&self, size: usize) -> NonNull<u8> {
let mut ptr = null_mut();
self.0.apply(|_| driver!(cuMemHostAlloc(&mut ptr, size, 0)));
NonNull::new(ptr.cast()).unwrap()
}

#[inline]
unsafe fn deallocate(&self, ptr: NonNull<u8>) {
self.0
.apply(|_| driver!(cuMemFreeHost(ptr.as_ptr().cast())));
}
}

#[test]
fn test_load() {
use model_parameters::{Memory, SafeTensorError};
use std::{io::ErrorKind::NotFound, time::Instant};

cuda::init();
let Some(dev) = cuda::Device::fetch() else {
return;
};

let t0 = Instant::now();
let safetensors = Memory::load_safetensors("../../TinyLlama-1.1B-Chat-v1.0_F16");
let t1 = Instant::now();
println!("mmap {:?}", t1 - t0);

let safetensors = match safetensors {
Ok(m) => m,
Err(SafeTensorError::Io(e)) if e.kind() == NotFound => return,
Err(e) => panic!("{e:?}"),
};

dev.context().apply(|ctx| {
let t0 = Instant::now();
let _model = Memory::realloc_with(&safetensors, HostAllocator(ctx.clone_ctx()));
let t1 = Instant::now();
println!("realloc {:?}", t1 - t0);
});
}
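
The point of staging the parameters in page-locked memory is that host-to-device copies from such a buffer are faster and can run asynchronously, which is what a later transfer of the weights to the GPU will rely on. A rough sketch of that follow-up step (not part of this commit; it assumes the `cuda` crate's `driver!` macro exposes the raw driver calls `cuMemAlloc` and `cuMemcpyHtoD` under exactly these names and that `CUdeviceptr` maps to `u64`):

use cuda::driver;

/// Hypothetical helper: upload a pinned host slice into a fresh device
/// allocation and return the device pointer.
unsafe fn upload(bytes: &[u8]) -> u64 {
    let mut dptr: u64 = 0;
    // Allocate device memory of the same size as the host copy.
    driver!(cuMemAlloc(&mut dptr, bytes.len()));
    // Because `bytes` lives in page-locked memory, this copy takes the fast DMA path.
    driver!(cuMemcpyHtoD(dptr, bytes.as_ptr().cast(), bytes.len()));
    dptr
}
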
