Skip to content

Commit

Permalink
fix: make avx512 fp16 a runtime check (#1884)
Browse files Browse the repository at this point in the history
Makes [avx512
fp16](https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide)
support a runtime check. This will allow binaries compiled w/ the
avx512fp16 feature to run on hardware that doesn't support this feature
(e.g. x86 CPUs before Sapphire Rapids).

The check does not add a performance penalty:
```
albertlockett@albert-ubuntu-saphire:~/lance/rust/lance-linalg$ TARGET_TIME=55 cargo bench \
  --bench dot \
  -F avx512fp16

   Compiling lance-linalg v0.9.9 (/home/albertlockett/lance/rust/lance-linalg)
    Finished bench [optimized + debuginfo] target(s) in 55.77s
     Running benches/dot.rs (/home/albertlockett/lance/rust/target/release/deps/dot-f42dee3ad61e0342)
Gnuplot not found, using plotters backend
Dot(half::binary16::f16, arrow_artiy)
                        time:   [2.5228 s 2.5230 s 2.5233 s]
                        change: [-0.0915% -0.0641% -0.0381%] (p = 0.00 < 0.10)
                        Change within noise threshold.

Dot(half::binary16::f16, auto-vectorization)
                        time:   [167.90 ms 168.05 ms 168.34 ms]
                        change: [-0.3945% -0.1097% +0.1731%] (p = 0.47 > 0.10)
                        No change in performance detected.

Dot(f16, SIMD)          time:   [167.03 ms 167.22 ms 167.50 ms]
                        change: [-1.4038% -0.9215% -0.4951%] (p = 0.00 < 0.10)
                        Change within noise threshold.
```
  • Loading branch information
albertlockett authored Jan 30, 2024
1 parent 2f67cf9 commit 628f7a3
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 42 deletions.
1 change: 1 addition & 0 deletions rust/lance-core/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

pub mod address;
pub mod cpu;
pub mod deletion;
pub mod mask;
pub mod testing;
Expand Down
40 changes: 40 additions & 0 deletions rust/lance-core/src/utils/cpu.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#[cfg(target_arch = "x86_64")]
pub mod x86 {
    //! Runtime CPU feature detection helpers for x86_64.

    use core::arch::x86_64::__cpuid;

    use lazy_static::lazy_static;

    /// Returns `true` when the bit at index `position` (0 = least significant)
    /// is set in `x`.
    #[inline]
    fn check_flag(x: usize, position: u32) -> bool {
        let mask = 1 << position;
        (x & mask) != 0
    }

    lazy_static! {
        /// Whether the running CPU reports AVX512-FP16 support.
        ///
        /// Evaluated once, on first access. The result is `false` whenever
        /// `avx512f` is not detected; otherwise it is bit 23 of EDX from
        /// CPUID leaf 7.
        pub static ref AVX512_F16_SUPPORTED: bool = {
            // `is_x86_feature_detected!` performs the OS-level checks needed
            // to know whether AVX512 may be used at all, so it gates the raw
            // CPUID query below.
            if is_x86_feature_detected!("avx512f") {
                // EAX=7, ECX=0: Extended Features (includes AVX512).
                // More info on calling CPUID can be found here (section 1.4):
                // https://www.intel.com/content/dam/develop/external/us/en/documents/architecture-instruction-set-extensions-programming-reference.pdf
                //
                // SAFETY: this module only compiles for x86_64, where the
                // CPUID instruction is always available.
                let extended_features = unsafe { __cpuid(7) };
                check_flag(extended_features.edx as usize, 23)
            } else {
                false
            }
        };
    }
}
1 change: 1 addition & 0 deletions rust/lance-linalg/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ arrow-schema = { workspace = true }
futures = { workspace = true }
half = { workspace = true }
lance-arrow = { workspace = true }
lance-core = { workspace = true }
log = { workspace = true }
num_cpus = { workspace = true }
num-traits = { workspace = true }
Expand Down
13 changes: 11 additions & 2 deletions rust/lance-linalg/benches/dot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

use std::iter::{repeat_with, Sum};
use std::time::Duration;

use arrow_array::{
types::{Float16Type, Float32Type, Float64Type},
Expand Down Expand Up @@ -131,18 +132,26 @@ fn bench_distance(c: &mut Criterion) {
run_bench::<Float64Type>(c);
}

/// Returns the Criterion measurement time for this benchmark.
///
/// The duration is taken from the `TARGET_TIME` environment variable, read at
/// compile time via `option_env!` and interpreted as a whole number of
/// seconds. When the variable is not set, 5 seconds is used.
///
/// # Panics
///
/// Panics if `TARGET_TIME` is set to something that does not parse as a `u64`.
fn bench_time() -> Duration {
    let raw = option_env!("TARGET_TIME").unwrap_or("5");
    // Fail with a message that names the variable and the offending value
    // instead of an anonymous `unwrap` panic.
    let secs: u64 = raw.parse().unwrap_or_else(|err| {
        panic!("TARGET_TIME must be a whole number of seconds, got {raw:?}: {err}")
    });
    Duration::from_secs(secs)
}

#[cfg(target_os = "linux")]
criterion_group!(
name=benches;
config = Criterion::default().significance_level(0.1).sample_size(10)
config = Criterion::default()
.significance_level(0.1)
.sample_size(10)
.measurement_time(bench_time())
.with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = bench_distance);

// Non-linux version does not support pprof.
#[cfg(not(target_os = "linux"))]
criterion_group!(
name=benches;
config = Criterion::default().significance_level(0.1).sample_size(10);
config = Criterion::default().significance_level(0.1).sample_size(10).measurement_time(bench_time());
targets = bench_distance);

criterion_main!(benches);
16 changes: 12 additions & 4 deletions rust/lance-linalg/src/distance/dot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ use lance_arrow::{ArrowFloatType, FloatArray, FloatToArrayType};
use num_traits::real::Real;
use num_traits::AsPrimitive;

#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
use lance_core::utils::cpu::x86::AVX512_F16_SUPPORTED;

use crate::simd::{
f32::{f32x16, f32x8},
SIMD,
Expand Down Expand Up @@ -112,13 +115,18 @@ mod kernel {
impl Dot for Float16Type {
#[inline]
fn dot(x: &[f16], y: &[f16]) -> f32 {
#[cfg(any(
all(target_os = "macos", target_feature = "neon"),
all(target_os = "linux", feature = "avx512fp16")
))]
#[cfg(all(target_os = "macos", target_feature = "neon"))]
unsafe {
kernel::dot_f16(x.as_ptr(), y.as_ptr(), x.len() as u32)
}

#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
if *AVX512_F16_SUPPORTED {
unsafe { kernel::dot_f16(x.as_ptr(), y.as_ptr(), x.len() as u32) }
} else {
dot_scalar::<f16, 16>(x, y)
}

#[cfg(not(any(
all(target_os = "macos", target_feature = "neon"),
all(target_os = "linux", feature = "avx512fp16")
Expand Down
14 changes: 10 additions & 4 deletions rust/lance-linalg/src/distance/l2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ use half::{bf16, f16};
use lance_arrow::{bfloat16::BFloat16Type, ArrowFloatType, FloatArray, FloatToArrayType};
use num_traits::{AsPrimitive, Float};

#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
use lance_core::utils::cpu::x86::AVX512_F16_SUPPORTED;

use crate::simd::{
f32::{f32x16, f32x8},
SIMD,
Expand Down Expand Up @@ -119,13 +122,16 @@ mod kernel {
impl L2 for Float16Type {
#[inline]
fn l2(x: &[f16], y: &[f16]) -> f32 {
#[cfg(any(
all(target_os = "macos", target_feature = "neon"),
all(target_os = "linux", feature = "avx512fp16")
))]
#[cfg(all(target_os = "macos", target_feature = "neon"))]
unsafe {
kernel::l2_f16(x.as_ptr(), y.as_ptr(), x.len() as u32)
}
#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
if *AVX512_F16_SUPPORTED {
unsafe { kernel::l2_f16(x.as_ptr(), y.as_ptr(), x.len() as u32) }
} else {
l2_scalar::<f16, 16>(x, y)
}
#[cfg(not(any(
all(target_os = "macos", target_feature = "neon"),
all(target_os = "linux", feature = "avx512fp16")
Expand Down
76 changes: 44 additions & 32 deletions rust/lance-linalg/src/distance/norm_l2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ use std::iter::Sum;
use half::{bf16, f16};
use num_traits::{AsPrimitive, Float};

#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
use lance_core::utils::cpu::x86::AVX512_F16_SUPPORTED;

use crate::simd::{
f32::{f32x16, f32x8},
SIMD,
Expand Down Expand Up @@ -45,47 +48,56 @@ mod kernel {
impl Normalize<f16> for &[f16] {
// #[inline]
fn norm_l2(&self) -> f32 {
#[cfg(any(
all(target_os = "macos", target_feature = "neon"),
feature = "avx512fp16"
))]
#[cfg(all(target_os = "macos", target_feature = "neon"))]
unsafe {
kernel::norm_l2_f16(self.as_ptr(), self.len() as u32)
}

#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
if *AVX512_F16_SUPPORTED {
unsafe { kernel::norm_l2_f16(self.as_ptr(), self.len() as u32) }
} else {
norm_l2_f16_impl(self)
}

#[cfg(not(any(
all(target_os = "macos", target_feature = "neon"),
feature = "avx512fp16"
)))]
{
// Please run `cargo bench --bench norm_l2` on Apple Silicon when
// changing the following code.
const LANES: usize = 16;
let chunks = self.chunks_exact(LANES);
let sum = if chunks.remainder().is_empty() {
0.0
} else {
chunks
.remainder()
.iter()
.map(|v| v.to_f32().powi(2))
.sum::<f32>()
};

let mut sums: [f32; LANES] = [0_f32; LANES];
for chk in chunks {
// Convert to f32
let mut f32_vals: [f32; LANES] = [0_f32; LANES];
for i in 0..LANES {
f32_vals[i] = chk[i].to_f32();
}
// Vectorized multiply
for i in 0..LANES {
sums[i] += f32_vals[i].powi(2);
}
}
(sums.iter().copied().sum::<f32>() + sum).sqrt()
norm_l2_f16_impl(self)
}
}

/// Portable L2 norm of an `f16` slice, accumulated in `f32`.
///
/// The input is processed in blocks of `LANES` elements: each block is first
/// widened to `f32`, then squared and added into per-lane accumulators.
/// Elements left over after the last full block are squared and summed
/// separately. The result is the square root of all partial sums combined.
#[inline]
#[cfg(not(all(target_os = "macos", target_feature = "neon")))]
fn norm_l2_f16_impl(arr: &[f16]) -> f32 {
    // Please run `cargo bench --bench norm_l2` on Apple Silicon when
    // changing the following code.
    const LANES: usize = 16;

    let blocks = arr.chunks_exact(LANES);

    // Squared sum of the tail that does not fill a whole block.
    let tail = blocks.remainder();
    let tail_sum = if tail.is_empty() {
        0.0
    } else {
        tail.iter().map(|v| v.to_f32().powi(2)).sum::<f32>()
    };

    let mut lane_sums = [0_f32; LANES];
    for block in blocks {
        // Widen the block to f32 first.
        let mut widened = [0_f32; LANES];
        for (dst, src) in widened.iter_mut().zip(block.iter()) {
            *dst = src.to_f32();
        }
        // Square and accumulate lane by lane.
        for (acc, value) in lane_sums.iter_mut().zip(widened.iter()) {
            *acc += value.powi(2);
        }
    }

    let total = lane_sums.iter().copied().sum::<f32>() + tail_sum;
    total.sqrt()
}

impl Normalize<bf16> for &[bf16] {
Expand Down

0 comments on commit 628f7a3

Please sign in to comment.