Support device mapping for Paged Attention #1011

Merged — 55 commits merged into master from device-mapping-paged-attn on Jan 1, 2025

Changes from 19 commits

Commits
acea1fd
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
6032955
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
946dfd9
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
8a134b9
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
6f02469
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
86d026a
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
d54e767
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
819278c
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
007f2db
Move start_offsets_kernel to correct device
cdoko Dec 28, 2024
e7d2d80
Update starcoder2.rs
cdoko Dec 28, 2024
db0cdc5
Support device mapping
cdoko Dec 28, 2024
05ed5fe
Support device mapping
cdoko Dec 28, 2024
937319b
Support device mapping
cdoko Dec 28, 2024
047ca07
Support device mapping
cdoko Dec 28, 2024
882f4e7
Support device mapping
cdoko Dec 28, 2024
a51bca6
format
cdoko Dec 28, 2024
db78205
Support device mapping
cdoko Dec 28, 2024
e6324b4
remove mut
cdoko Dec 28, 2024
8fadbbc
remove mut
cdoko Dec 28, 2024
9d9918d
Merge branch 'master' into device-mapping-paged-attn
cdoko Dec 31, 2024
895d0a9
Add get_unique_devices method
cdoko Dec 31, 2024
7d46900
Move tensor for device mapping
cdoko Dec 31, 2024
aa90ef2
Add DeviceMapper
cdoko Dec 31, 2024
8fc40fc
Fix wrong RotaryEmbedding import
cdoko Dec 31, 2024
ad66e29
Fix wrong RotaryEmbedding import
cdoko Dec 31, 2024
e0719f9
Remove unecessary tensor copies
cdoko Dec 31, 2024
cffeaaa
Add DeviceMapper
cdoko Dec 31, 2024
f0f3ac1
Add DeviceMapper
cdoko Dec 31, 2024
e935067
Add DeviceMapper
cdoko Dec 31, 2024
a833acf
Add device mapping
cdoko Dec 31, 2024
efbd6f4
Create tensor copies for each device for pa
cdoko Dec 31, 2024
8a0177a
Add device mapper
cdoko Dec 31, 2024
b614be9
Add device mapper
cdoko Dec 31, 2024
30618da
Add device mapper
cdoko Dec 31, 2024
0215e86
Add device mapper
cdoko Dec 31, 2024
44e0559
Add device mapper
cdoko Dec 31, 2024
095e28a
Add device mapper
cdoko Dec 31, 2024
80eb294
Add device mapper
cdoko Dec 31, 2024
ef7ee66
Add device mapper
cdoko Dec 31, 2024
f269c55
Add device mapper
cdoko Dec 31, 2024
587b4f7
Add device mapper
cdoko Dec 31, 2024
36d89c9
add device mapper
cdoko Dec 31, 2024
17f8065
Remove unecessary tensor move
cdoko Dec 31, 2024
3ca105a
Remove unecessary tensor move
cdoko Dec 31, 2024
40706f2
Remove unecessary tensor move
cdoko Dec 31, 2024
62d2126
Remove unecessary tensor move
cdoko Dec 31, 2024
78189c9
Remove unecessary tensor move
cdoko Dec 31, 2024
6724ee1
Remove unecessary tensor move
cdoko Dec 31, 2024
6ca0625
Remove unecessary tensor move
cdoko Dec 31, 2024
d3b4dae
Remove unecessary tensor move
cdoko Dec 31, 2024
ae3f53e
format
cdoko Dec 31, 2024
3bf680d
format
cdoko Dec 31, 2024
7560df9
format
cdoko Dec 31, 2024
83cf77d
clippy
cdoko Dec 31, 2024
45aad07
format
cdoko Dec 31, 2024
1 change: 1 addition & 0 deletions mistralrs-core/src/dummy_paged_attention/cache_engine.rs
@@ -26,6 +26,7 @@ impl CacheEngine {
_cache_config: &CacheConfig,
_dtype: DType,
_device: &Device,
_layer_devices: Vec<Option<Device>>,
) -> Result<Self> {
Ok(Self {
dummy_cache: Arc::new(Mutex::new(Vec::new())),
1 change: 1 addition & 0 deletions mistralrs-core/src/models/gemma.rs
@@ -284,6 +284,7 @@ impl Attention {
v.reshape((b_sz, self.num_kv_heads, q_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
EricLBuehler (Owner) commented:
Can we merge these into RotaryEmbedding in layers.rs?

self.rotary_emb
.forward(seqlen_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;
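A minimal sketch of what that merge could look like, folding the device move into RotaryEmbedding::forward in layers.rs so each model no longer needs its own to_device call. The signature mirrors the call sites in this diff; the parameter types and the apply_rotary helper are assumptions, not the crate's actual code:

impl RotaryEmbedding {
    pub fn forward(
        &self,
        seqlen_offsets: &[usize],
        start_offsets_kernel: &Tensor,
        q: &mut Tensor,
        k: &mut Tensor,
        b_sz: usize,
    ) -> candle_core::Result<()> {
        // Under device mapping, the offsets tensor may live on another device;
        // move it once here instead of at every model's call site.
        let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
        // Stand-in for the existing rotary application body.
        self.apply_rotary(seqlen_offsets, &start_offsets_kernel, q, k, b_sz)
    }
}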

1 change: 1 addition & 0 deletions mistralrs-core/src/models/gemma2.rs
@@ -292,6 +292,7 @@ impl Attention {
v.reshape((b_sz, self.num_kv_heads, q_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary_emb
.forward(seqlen_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;

1 change: 1 addition & 0 deletions mistralrs-core/src/models/mistral.rs
@@ -261,6 +261,7 @@ impl Attention {
v.reshape((b_sz, self.num_kv_heads, q_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary_emb
.forward(seqlen_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;

1 change: 1 addition & 0 deletions mistralrs-core/src/models/mixtral.rs
@@ -156,6 +156,7 @@ impl Attention {
v.reshape((b_sz, self.num_kv_heads, q_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary_emb
.forward(seqlen_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;

1 change: 1 addition & 0 deletions mistralrs-core/src/models/phi2.rs
@@ -275,6 +275,7 @@ impl Attention {
v.reshape((b_size, self.num_kv_heads, seq_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary_emb.forward(
seqlen_offsets,
&start_offsets_kernel,
1 change: 1 addition & 0 deletions mistralrs-core/src/models/quantized_llama.rs
@@ -163,6 +163,7 @@ impl LayerWeights {
v.reshape((b_sz, self.n_kv_head, seq_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary
.forward(start_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;

1 change: 1 addition & 0 deletions mistralrs-core/src/models/quantized_qwen2.rs
@@ -79,6 +79,7 @@ impl LayerWeights {
v.reshape((b_sz, self.n_kv_head, seq_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary
.forward(start_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;

1 change: 1 addition & 0 deletions mistralrs-core/src/models/quantized_starcoder2.rs
@@ -87,6 +87,7 @@ impl LayerWeights {
v.reshape((b_sz, self.n_kv_head, q_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary_emb
.forward(seqlen_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;

1 change: 1 addition & 0 deletions mistralrs-core/src/models/qwen2.rs
@@ -253,6 +253,7 @@ impl Attention {
v.reshape((b_sz, self.num_kv_heads, q_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary_emb
.forward(seqlen_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;

1 change: 1 addition & 0 deletions mistralrs-core/src/models/starcoder2.rs
@@ -246,6 +246,7 @@ impl Attention {
v.reshape((b_sz, self.num_kv_heads, q_len, self.head_dim))?
};

let start_offsets_kernel = start_offsets_kernel.to_device(q.device())?;
self.rotary_emb
.forward(seqlen_offsets, &start_offsets_kernel, &mut q, &mut k, b_sz)?;

7 changes: 6 additions & 1 deletion mistralrs-core/src/paged_attention/cache_engine.rs
@@ -29,13 +29,15 @@ impl CacheEngine {
cache_config: &CacheConfig,
dtype: DType,
device: &Device,
layer_devices: Vec<Option<Device>>,
) -> Result<Self> {
Ok(Self {
gpu_cache: Arc::new(Mutex::new(Self::allocate_gpu_cache(
model_config,
cache_config,
dtype,
device,
layer_devices,
)?)),
cpu_cache: Self::allocate_cpu_cache(model_config, cache_config, dtype, device)?,
num_layers: model_config.num_layers(),
@@ -55,13 +57,16 @@ impl CacheEngine {
cache_config: &CacheConfig,
dtype: DType,
device: &Device,
layer_devices: Vec<Option<Device>>,
) -> Result<Vec<KVCache>> {
let key_block_shape =
Self::calculate_key_block_shape(model_config, dtype, cache_config.block_size);
let value_block_shape =
Self::calculate_value_block_shape(model_config, cache_config.block_size);
let mut gpu_cache = Vec::new();
for _ in 0..model_config.num_layers() {

for i in 0..model_config.num_layers() {
let device = layer_devices[i].as_ref().unwrap_or(device);
let key_blocks = Tensor::zeros(
(
cache_config.num_gpu_blocks,
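For reference, callers of the updated CacheEngine::new build the per-layer device list from the device mapper and thread it through. A condensed sketch, mirroring the pipeline changes later in this diff (the names come from that code):

let mut layer_devices = Vec::new();
for layer in 0..num_hidden_layers {
    // None falls back to the engine's default device via the unwrap_or above.
    layer_devices.push(mapper.device_for(layer, false).cloned());
}
let cache_engine = CacheEngine::new(model_config, &cache_config, dtype, device, layer_devices)?;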
47 changes: 36 additions & 11 deletions mistralrs-core/src/paged_attention/layers/paged_attention.rs
@@ -70,6 +70,34 @@ impl PagedAttention {
input_metadata.slot_mappings.clone()
};

// When device mapping is used, these tensors are fixed on the first device and must be moved to the same device as q, k, and v:
EricLBuehler (Owner) commented:
I see you mentioned a performance penalty:

Currently, device-mapped paged attention is approximately 10% slower compared to single device paged attention. I found the slowdown is at least partially due to the overhead of moving tensors to the device on every layer forward pass.

To avoid this, can you please update PagedAttentionInputMetadata to store all tensors as hashmaps of device location to the actual tensor? Do you think this is a good solution?

This takes up more memory on each GPU but requires only one copy (in the inputs processor) and enables us to remove this section. I'm thinking something similar to this where we create multiple RoPE instantiations on different devices.

Additionally, I just merged #1014. Can you please merge with master to get these new changes, otherwise a conflict will occur with the addition of the changes I requested above.

// - slot_mapping
// - input_metadata.block_tables
// - input_metadata.context_lens
// - self.alibi_slopes
// - attention_mask
let slot_mapping = slot_mapping.to_device(query.device())?;
let block_tables = input_metadata
.block_tables
.as_ref()
.unwrap()
.to_device(query.device())?;
let context_lens = input_metadata
.context_lens
.as_ref()
.unwrap()
.to_device(query.device())?;
let alibi_slopes = if let Some(alibi_slopes) = self.alibi_slopes.as_ref() {
Some(alibi_slopes.to_device(query.device())?)
} else {
None
};
let attention_mask = if let Some(mask) = attention_mask {
Some(mask.to_device(query.device())?)
} else {
None
};

let (batch_size, attention_heads, seq_len, head_size) = query.shape().dims4()?;
let (_, key_value_heads, _, _) = key.shape().dims4()?;

@@ -80,7 +108,7 @@
query,
key,
value,
Some(mask),
Some(&mask),
None,
&SdpaParams {
n_kv_groups: self.n_kv_groups,
@@ -92,7 +120,7 @@
)?),
};

// // paged-attn expects [batch_size, num_tokens, num_heads, head_size]
// paged-attn expects [batch_size, num_tokens, num_heads, head_size]
let (query, key, value) = if seq_len > 1 {
let q = query
.transpose(1, 2)?
@@ -105,7 +133,7 @@
.reshape(((), key_value_heads, head_size))?;
(q, k, v)
} else {
//avoid unnecessary transpose for decoding
// avoid unnecessary transpose for decoding
let q = query.reshape(((), attention_heads, head_size))?;
let k = key.reshape(((), key_value_heads, head_size))?;
let v = value.reshape(((), key_value_heads, head_size))?;
@@ -131,7 +159,6 @@
// Return result in prefill
return Ok(att);
}

// Args:
// output: shape = [num_generation_tokens, num_heads, head_size]
//
@@ -147,18 +174,16 @@
//
// alibi_slopes: shape = [num_heads]
#[allow(clippy::cast_possible_truncation)]
let res = paged_attention(
paged_attention(
&query,
key_cache.as_ref().unwrap(),
value_cache.as_ref().unwrap(),
input_metadata.block_tables.as_ref().unwrap(),
input_metadata.context_lens.as_ref().unwrap(),
self.alibi_slopes.as_ref(),
&block_tables,
&context_lens,
alibi_slopes.as_ref(),
input_metadata.max_context_len.unwrap(),
self.scale,
softcapping.unwrap_or(1.0f64) as f32,
)?;

Ok(res)
)
}
}
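A minimal sketch of the restructuring proposed in the review comment above: store one resident copy of each metadata tensor per device, built once in the inputs processor, so the per-layer forward does a HashMap lookup instead of a device copy. This assumes candle's Device::location() and its hashable DeviceLocation; the struct fields and the replicate helper are illustrative, not the crate's actual API:

use std::collections::HashMap;
use candle_core::{Device, DeviceLocation, Result, Tensor};

// One copy of each metadata tensor per device location.
pub struct PagedAttentionInputMetadata {
    pub slot_mappings: HashMap<DeviceLocation, Tensor>,
    pub block_tables: HashMap<DeviceLocation, Tensor>,
    pub context_lens: HashMap<DeviceLocation, Tensor>,
}

// Built once, in the inputs processor, for every unique mapped device.
fn replicate(t: &Tensor, devices: &[Device]) -> Result<HashMap<DeviceLocation, Tensor>> {
    let mut copies = HashMap::new();
    for dev in devices {
        copies.insert(dev.location(), t.to_device(dev)?);
    }
    Ok(copies)
}

// The per-layer forward pass then becomes a lookup:
// let block_tables = &input_metadata.block_tables[&query.device().location()];

This trades some extra memory on each GPU for eliminating the per-layer to_device calls blamed for the roughly 10% slowdown.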
46 changes: 31 additions & 15 deletions mistralrs-core/src/pipeline/gguf.rs
@@ -332,7 +332,7 @@ impl Loader for GGUFLoader {
silent: bool,
mapper: DeviceMapMetadata,
in_situ_quant: Option<IsqType>,
mut paged_attn_config: Option<PagedAttentionConfig>,
paged_attn_config: Option<PagedAttentionConfig>,
) -> Result<Arc<Mutex<dyn Pipeline + Send + Sync>>> {
if in_situ_quant.is_some() {
anyhow::bail!(
@@ -353,10 +353,11 @@
self.get_id(),
device.device_pretty_repr()
);
} else if paged_attn_config.is_some() {
warn!("Device mapping or device topology and PagedAttention are incompatible, disabling PagedAttention.");
paged_attn_config = None;
}
// } else if paged_attn_config.is_some() {
//     warn!("Device mapping or device topology and PagedAttention are incompatible, disabling PagedAttention.");
//     paged_attn_config = None;
// }

let mut readers = Vec::new();
for filename in paths.get_weight_filenames() {
@@ -408,7 +409,7 @@
// Base config (quantization only):
let quant = ModelConfig::ParamsGGUF(
model,
(device, mapper, self.config.topology.as_ref()).into(),
(device, mapper.clone(), self.config.topology.as_ref()).into(),
if paged_attn_config.is_some() {
AttentionImplementation::PagedAttention
} else {
@@ -453,6 +454,24 @@
_ => unreachable!(),
};

let num_hidden_layers = match model {
Model::Llama(ref model) => model.cache.normal().0.len(),
Model::Phi2(ref model) => model.cache.normal().0.len(),
Model::XLoraLlama(ref model) => model.cache.full().lock().len(),
Model::Phi3(ref model) => model.cache.normal().0.len(),
Model::XLoraPhi3(ref model) => model.cache.full().lock().len(),
Model::Starcoder2(ref model) => model.cache.normal().0.len(),
Model::Qwen2(ref model) => model.cache.normal().0.len(),
};

let mapper =
mapper.into_mapper(num_hidden_layers, device, self.config.topology.as_ref())?;
let mut layer_devices = Vec::new();
for layer in 0..num_hidden_layers {
let device = mapper.device_for(layer, false).cloned();
layer_devices.push(device);
}

let (cache_config, cache_engine) = if let Some(paged_attn_config) = paged_attn_config {
let model_config: &dyn ModelConfigLike = &model_config_metadata;
let cache_config = calculate_cache_config(
@@ -463,7 +482,13 @@
model_config,
device,
)?;
let cache_engine = CacheEngine::new(model_config, &cache_config, DType::F32, device)?;
let cache_engine = CacheEngine::new(
model_config,
&cache_config,
DType::F32,
device,
layer_devices,
)?;
(Some(cache_config), Some(cache_engine))
} else {
(None, None)
@@ -494,15 +519,6 @@
Model::Qwen2(ref p) => p.max_seq_len,
};
let tok_env = build_tok_env(tokenizer.clone());
let num_hidden_layers = match model {
Model::Llama(ref model) => model.cache.normal().0.len(),
Model::Phi2(ref model) => model.cache.normal().0.len(),
Model::XLoraLlama(ref model) => model.cache.full().lock().len(),
Model::Phi3(ref model) => model.cache.normal().0.len(),
Model::XLoraPhi3(ref model) => model.cache.full().lock().len(),
Model::Starcoder2(ref model) => model.cache.normal().0.len(),
Model::Qwen2(ref model) => model.cache.normal().0.len(),
};

if chat_template.bos_token.is_none() && bos.is_some() {
chat_template.bos_token = Some(BeginEndUnkTok(Either::Left(bos.unwrap())));
14 changes: 8 additions & 6 deletions mistralrs-core/src/pipeline/normal.rs
@@ -271,7 +271,7 @@ impl Loader for NormalLoader {
silent: bool,
mapper: DeviceMapMetadata,
in_situ_quant: Option<IsqType>,
mut paged_attn_config: Option<PagedAttentionConfig>,
paged_attn_config: Option<PagedAttentionConfig>,
) -> Result<Arc<Mutex<dyn Pipeline + Send + Sync>>> {
let config = std::fs::read_to_string(paths.get_config_filename())?;
// Otherwise, the device mapper will print it
@@ -288,16 +288,17 @@
self.get_id(),
device.device_pretty_repr()
);
} else if paged_attn_config.is_some() {
warn!("Device mapping or device topology and PagedAttention are incompatible, disabling PagedAttention.");
paged_attn_config = None;
}

let mapper = mapper.into_mapper(
self.inner.get_total_device_mapping_num_layers(&config)?,
device,
self.config.topology.as_ref(),
)?;
let mut layer_devices = Vec::new();
for layer in 0..self.inner.get_total_device_mapping_num_layers(&config)? {
let device = mapper.device_for(layer, false).cloned();
layer_devices.push(device);
}
let dtype = mapper.get_min_dtype(dtype)?;

info!(
@@ -523,7 +524,8 @@
model.config(),
device,
)?;
let cache_engine = CacheEngine::new(model.config(), &cache_config, dtype, device)?;
let cache_engine =
CacheEngine::new(model.config(), &cache_config, dtype, device, layer_devices)?;
(Some(cache_config), Some(cache_engine))
} else {
(None, None)
8 changes: 7 additions & 1 deletion mistralrs-core/src/pipeline/vision.rs
@@ -235,6 +235,11 @@ impl Loader for VisionLoader {
device,
self.config.topology.as_ref(),
)?;
let mut layer_devices = Vec::new();
for layer in 0..self.inner.get_total_device_mapping_num_layers(&config)? {
let device = mapper.device_for(layer, false).cloned();
layer_devices.push(device);
}
let dtype = mapper.get_min_dtype(dtype)?;

let mut loading_isq = in_situ_quant.is_some() || self.config.from_uqff.is_some();
@@ -435,7 +440,8 @@
model.config(),
device,
)?;
let cache_engine = CacheEngine::new(model.config(), &cache_config, dtype, device)?;
let cache_engine =
CacheEngine::new(model.config(), &cache_config, dtype, device, layer_devices)?;
(Some(cache_config), Some(cache_engine))
} else {
(None, None)