feat(transformer-nvidia): copy parameters to page-locked (pinned) memory first
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Feb 26, 2024
1 parent e2407d0 commit 92c50a1
Showing 8 changed files with 188 additions and 8 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -5,6 +5,7 @@ members = [
"model-parameters",
"tokenizer",
"transformer-cpu",
"transformer-nvidia",
"xtask",
]
resolver = "2"
30 changes: 26 additions & 4 deletions model-parameters/src/lib.rs
@@ -8,7 +8,7 @@ use std::{
};
use tensor::{DataType, Tensor};

pub use memory::{Memory, SafeTensorError};
pub use memory::{Allocator, Memory, SafeTensorError};
pub use save::save;

pub trait Llama2 {
@@ -80,6 +80,23 @@ pub trait Llama2 {
fn model_norm(&self) -> Tensor<Storage>;
/// Shape = `vocab_size x hidden_size`.
fn lm_head(&self) -> Tensor<Storage>;

fn tensors(&self) -> Vec<Tensor<Storage>> {
let mut tensors = Vec::with_capacity(self.num_hidden_layers() * 6 + 3);
tensors.push(self.embed_tokens());
for layer in 0..self.num_hidden_layers() {
tensors.push(self.input_layernorm(layer));
tensors.push(self.w_qkv(layer));
tensors.push(self.self_attn_o_proj(layer));
tensors.push(self.post_attention_layernorm(layer));
tensors.push(self.mlp_gate_up(layer));
tensors.push(self.mlp_down(layer));
}
tensors.push(self.model_norm());
tensors.push(self.lm_head());
tensors
}
}

#[derive(serde::Serialize, serde::Deserialize, Debug)]
@@ -119,7 +136,7 @@ impl From<&dyn Llama2> for ConfigJson {

#[derive(Clone)]
pub struct Storage {
data: Arc<dyn AsRef<[u8]>>,
data: Arc<dyn Deref<Target = [u8]>>,
range: Range<usize>,
}

@@ -134,19 +151,24 @@ impl Deref for Storage {

impl Storage {
#[inline]
pub fn new(data: Arc<dyn AsRef<[u8]>>, offset: usize, len: usize) -> Self {
pub fn new(data: Arc<dyn Deref<Target = [u8]>>, offset: usize, len: usize) -> Self {
Self {
data,
range: offset..offset + len,
}
}

#[inline]
pub fn from_blob(data: impl 'static + AsRef<[u8]>) -> Self {
pub fn from_blob(data: impl 'static + Deref<Target = [u8]>) -> Self {
let len = data.as_ref().len();
Self {
data: Arc::new(data),
range: 0..len,
}
}

#[inline]
pub fn raw_blob(&self) -> &[u8] {
&self.data
}
}
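
The change of bound from `AsRef<[u8]>` to `Deref<Target = [u8]>` matters because the owner types introduced below (`TotalStorage`, wrapping the page-locked buffer) expose their bytes through `Deref` rather than `AsRef`. A minimal sketch of the kind of owner the relaxed bound admits (the `PinnedBuf` type here is hypothetical, for illustration only):

use std::ops::Deref;

/// Hypothetical owner of a page-locked host buffer.
struct PinnedBuf {
    ptr: *mut u8,
    len: usize,
}

impl Deref for PinnedBuf {
    type Target = [u8];

    fn deref(&self) -> &[u8] {
        // Safety: `ptr` is assumed to point at `len` initialized bytes
        // for as long as this owner is alive.
        unsafe { std::slice::from_raw_parts(self.ptr, self.len) }
    }
}

// `PinnedBuf` never implements `AsRef<[u8]>`, yet it can now back a `Storage`:
// let storage = Storage::from_blob(pinned_buf);
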
5 changes: 3 additions & 2 deletions model-parameters/src/memory/mod.rs
@@ -1,10 +1,12 @@
mod cast;
mod realloc;
mod safe_tensors;

use crate::{ConfigJson, DataType, Llama2, Storage};
use common::utok;
use tensor::{udim, Shape, Tensor};

pub use realloc::Allocator;
pub use safe_tensors::SafeTensorError;
pub(crate) use safe_tensors::SafeTensorHeaderJson;

@@ -210,8 +212,7 @@ fn concat0(tensors: &[&Tensor<Storage>]) -> Tensor<Storage> {

#[test]
fn test_load() {
use std::io::ErrorKind::NotFound;
use std::time::Instant;
use std::{io::ErrorKind::NotFound, time::Instant};

let t0 = Instant::now();
let safetensors = Memory::load_safetensors("../../TinyLlama-1.1B-Chat-v1.0");
77 changes: 77 additions & 0 deletions model-parameters/src/memory/realloc.rs
@@ -0,0 +1,77 @@
use crate::{memory::Layer, ConfigJson, Llama2, Memory, Storage};
use std::{ops::Deref, ptr::NonNull, slice::from_raw_parts_mut, sync::Arc};
use tensor::Tensor;

pub trait Allocator {
unsafe fn allocate(&self, size: usize) -> NonNull<u8>;
unsafe fn deallocate(&self, ptr: NonNull<u8>);
}

struct TotalStorage<A: Allocator> {
ptr: NonNull<u8>,
len: usize,
allocator: A,
}

impl<A: Allocator> Deref for TotalStorage<A> {
type Target = [u8];

fn deref(&self) -> &Self::Target {
unsafe { std::slice::from_raw_parts(self.ptr.as_ptr(), self.len) }
}
}

impl<A: Allocator> Drop for TotalStorage<A> {
fn drop(&mut self) {
unsafe { self.allocator.deallocate(self.ptr) }
}
}

impl Memory {
pub fn realloc_with(src: &dyn Llama2, allocator: impl Allocator + 'static) -> Self {
let len = src.size();
let ptr = unsafe { allocator.allocate(len) };
let total = Arc::new(TotalStorage {
ptr,
len,
allocator,
});

struct Writer<A: Allocator> {
total: Arc<TotalStorage<A>>,
offset: usize,
}
impl<A: Allocator + 'static> Writer<A> {
fn write(&mut self, tensor: Tensor<Storage>) -> Tensor<Storage> {
let offset = self.offset;
let ptr = self.total.ptr.as_ptr();
let len = tensor.bytes_size();
self.offset += len;
unsafe { tensor.reform_to_raw(from_raw_parts_mut(ptr.add(offset), len)) };
Tensor::new(
tensor.data_type(),
tensor.shape(),
Storage::new(self.total.clone(), offset, len),
)
}
}

let mut writer = Writer { total, offset: 0 };
Self {
config: ConfigJson::from(src),
embed_tokens: writer.write(src.embed_tokens()),
layers: (0..src.num_hidden_layers())
.map(|layer| Layer {
input_layernorm: writer.write(src.input_layernorm(layer)),
w_qkv: writer.write(src.w_qkv(layer)),
self_attn_o_proj: writer.write(src.self_attn_o_proj(layer)),
post_attention_layernorm: writer.write(src.post_attention_layernorm(layer)),
mlp_gate_up: writer.write(src.mlp_gate_up(layer)),
mlp_down: writer.write(src.mlp_down(layer)),
})
.collect(),
model_norm: writer.write(src.model_norm()),
lm_head: writer.write(src.lm_head()),
}
}
}
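
For context, a minimal sketch of how the new `Allocator` trait and `Memory::realloc_with` are meant to fit together, using an ordinary heap allocation instead of pinned memory (this `HeapAllocator` is illustrative and not part of the commit; it stashes the size in a small header so `deallocate` can rebuild the `Layout`):

use model_parameters::{Allocator, Memory};
use std::{
    alloc::{alloc, dealloc, Layout},
    ptr::NonNull,
};

struct HeapAllocator;

impl Allocator for HeapAllocator {
    unsafe fn allocate(&self, size: usize) -> NonNull<u8> {
        // Reserve 8 extra bytes in front to remember the size for `deallocate`.
        let layout = Layout::from_size_align(size + 8, 8).unwrap();
        let base = NonNull::new(alloc(layout)).expect("allocation failed");
        base.as_ptr().cast::<usize>().write(size);
        NonNull::new_unchecked(base.as_ptr().add(8))
    }

    unsafe fn deallocate(&self, ptr: NonNull<u8>) {
        let base = ptr.as_ptr().sub(8);
        let size = base.cast::<usize>().read();
        dealloc(base, Layout::from_size_align(size + 8, 8).unwrap());
    }
}

// All parameters of a loaded model end up back to back in one heap block:
// let contiguous = Memory::realloc_with(&src, HeapAllocator);
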
3 changes: 1 addition & 2 deletions transformer-cpu/src/lib.rs
@@ -201,8 +201,7 @@ fn tensor(dt: DataType, shape: &[udim]) -> Tensor<Storage> {
#[test]
fn test_build() {
use model_parameters::SafeTensorError;
use std::io::ErrorKind::NotFound;
use std::time::Instant;
use std::{io::ErrorKind::NotFound, time::Instant};

let t0 = Instant::now();
let safetensors = Memory::load_safetensors("../../TinyLlama-1.1B-Chat-v1.0");
20 changes: 20 additions & 0 deletions transformer-nvidia/Cargo.toml
@@ -0,0 +1,20 @@
[package]
name = "transformer-nvidia"
version = "0.0.0"
edition = "2021"
authors = ["YdrMaster <[email protected]>"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
common = { path = "../common" }
tensor = { path = "../tensor" }
model-parameters = { path = "../model-parameters" }
# cuda = { git = "https://github.com/YdrMaster/cuda-bench" }
cuda = { path = "../../cuda-bench/cuda" }

[dev-dependencies]
tokenizer = { path = "../tokenizer" }

[build-dependencies]
find_cuda_helper = "0.2"
5 changes: 5 additions & 0 deletions transformer-nvidia/build.rs
@@ -0,0 +1,5 @@
fn main() {
if find_cuda_helper::find_cuda_root().is_some() {
println!("cargo:rustc-cfg=detected_cuda");
}
}
55 changes: 55 additions & 0 deletions transformer-nvidia/src/lib.rs
@@ -0,0 +1,55 @@
#![cfg(detected_cuda)]

use cuda::{driver, Context};
use std::{
ptr::{null_mut, NonNull},
sync::Arc,
};

pub extern crate model_parameters;

struct HostAllocator(Arc<Context>);

impl model_parameters::Allocator for HostAllocator {
#[inline]
unsafe fn allocate(&self, size: usize) -> NonNull<u8> {
let mut ptr = null_mut();
self.0.apply(|_| driver!(cuMemHostAlloc(&mut ptr, size, 0)));
NonNull::new(ptr.cast()).unwrap()
}

#[inline]
unsafe fn deallocate(&self, ptr: NonNull<u8>) {
self.0
.apply(|_| driver!(cuMemFreeHost(ptr.as_ptr().cast())));
}
}

#[test]
fn test_load() {
use model_parameters::{Memory, SafeTensorError};
use std::{io::ErrorKind::NotFound, time::Instant};

cuda::init();
let Some(dev) = cuda::Device::fetch() else {
return;
};

let t0 = Instant::now();
let safetensors = Memory::load_safetensors("../../TinyLlama-1.1B-Chat-v1.0_F16");
let t1 = Instant::now();
println!("mmap {:?}", t1 - t0);

let safetensors = match safetensors {
Ok(m) => m,
Err(SafeTensorError::Io(e)) if e.kind() == NotFound => return,
Err(e) => panic!("{e:?}"),
};

dev.context().apply(|ctx| {
let t0 = Instant::now();
let _model = Memory::realloc_with(&safetensors, HostAllocator(ctx.clone_ctx()));
let t1 = Instant::now();
println!("realloc {:?}", t1 - t0);
});
}
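
The point of staging the parameters in page-locked memory is that host-to-device copies from such a buffer are faster and can run asynchronously, which is what a later transfer of the weights to the GPU will rely on. A rough sketch of that follow-up step (not part of this commit; it assumes the `cuda` crate's `driver!` macro exposes the raw driver calls `cuMemAlloc` and `cuMemcpyHtoD` under exactly these names and that `CUdeviceptr` maps to `u64`):

use cuda::driver;

/// Hypothetical helper: upload a pinned host slice into a fresh device
/// allocation and return the device pointer.
unsafe fn upload(bytes: &[u8]) -> u64 {
    let mut dptr: u64 = 0;
    // Allocate device memory of the same size as the host copy.
    driver!(cuMemAlloc(&mut dptr, bytes.len()));
    // Because `bytes` lives in page-locked memory, this copy takes the fast DMA path.
    driver!(cuMemcpyHtoD(dptr, bytes.as_ptr().cast(), bytes.len()));
    dptr
}
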
