Add x86 implementation of 8x16 ushr
This involves some large mask tables that may hurt code size but reduce the number of instructions. See WebAssembly/simd#117 for a more in-depth discussion on this.
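As a quick orientation for readers of the diff below, here is a minimal scalar sketch of the technique (a hedged illustration only; the function name and layout are mine, not part of the commit): x86 has no packed 8-bit logical right shift, so each pair of bytes is shifted as a single 16-bit lane and the bits that leak in from the neighboring byte are cleared with a per-shift-amount mask of 0xff >> shift in every byte.

// Hypothetical helper, for illustration only: emulate an i8x16 logical shift
// right using 16-bit shifts plus a per-shift-amount byte mask.
fn ushr_i8x16_via_i16x8(lanes: [u8; 16], shift: u32) -> [u8; 16] {
    // Mirror the masking of the shift amount that the Wasm translator performs.
    let shift = shift & 7;
    // One row of the 128-byte mask table: 0xff >> shift in every byte.
    let mask = 0xffu8 >> shift;
    let mut out = [0u8; 16];
    for i in (0..16).step_by(2) {
        // Little-endian: byte i is the low byte of the 16-bit lane, byte i + 1 the high byte.
        let lane = u16::from_le_bytes([lanes[i], lanes[i + 1]]) >> shift;
        let [lo, hi] = lane.to_le_bytes();
        out[i] = lo & mask; // clear bits shifted in from the high byte
        out[i + 1] = hi & mask; // already correct; the mask is a no-op here
    }
    out
}

fn main() {
    let v: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    // Matches the expectation in the new simd-bitwise-run.clif test below.
    assert_eq!(ushr_i8x16_via_i16x8(v, 1), [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]);
}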
abrown committed Mar 21, 2020
1 parent 2f648ea commit aba1ef4
Showing 4 changed files with 113 additions and 10 deletions.
18 changes: 9 additions & 9 deletions cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -357,7 +357,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_psll = x86_instructions.by_name("x86_psll");
let x86_psra = x86_instructions.by_name("x86_psra");
let x86_psrl = x86_instructions.by_name("x86_psrl");
let x86_ptest = x86_instructions.by_name("x86_ptest");

let imm = &shared.imm;
@@ -497,14 +496,14 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
}

// SIMD shift right (logical)
for ty in &[I16, I32, I64] {
let ushr = ushr.bind(vector(*ty, sse_vector_size));
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
narrow.legalize(
def!(a = ushr(x, y)),
vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
);
}
// for ty in &[I16, I32, I64] {
// let ushr = ushr.bind(vector(*ty, sse_vector_size));
// let bitcast = bitcast.bind(vector(I64, sse_vector_size));
// narrow.legalize(
// def!(a = ushr(x, y)),
// vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
// );
// }

// SIMD shift left (arithmetic)
for ty in &[I16, I32, I64] {
Expand Down Expand Up @@ -695,6 +694,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
narrow.custom_legalize(extractlane, "convert_extractlane");
narrow.custom_legalize(insertlane, "convert_insertlane");
narrow.custom_legalize(ineg, "convert_ineg");
narrow.custom_legalize(ushr, "convert_ushr");

narrow.build_and_add_to(&mut shared.transform_groups);
}
64 changes: 63 additions & 1 deletion cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -6,7 +6,7 @@ use crate::cursor::{Cursor, FuncCursor};
use crate::flowgraph::ControlFlowGraph;
use crate::ir::condcodes::{FloatCC, IntCC};
use crate::ir::types::*;
use crate::ir::{self, Function, Inst, InstBuilder};
use crate::ir::{self, Function, Inst, InstBuilder, MemFlags};
use crate::isa::constraints::*;
use crate::isa::enc_tables::*;
use crate::isa::encoding::base_size;
@@ -1289,6 +1289,68 @@ fn convert_ineg(
}
}

static USHR_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];

// TODO this would be unnecessary if we could pass varargs in legalize.rs
// For SIMD logical shift right, convert `ushr` to `x86_psrl`; the 8x16 case has no matching
// x86 instruction, so it is emulated with a 16x8 shift plus a mask loaded from USHR_MASKS.
fn convert_ushr(
inst: ir::Inst,
func: &mut ir::Function,
_cfg: &mut ControlFlowGraph,
isa: &dyn TargetIsa,
) {
let mut pos = FuncCursor::new(func).at_inst(inst);
pos.use_srcloc(inst);

if let ir::InstructionData::Binary {
opcode: ir::Opcode::Ushr,
args: [arg0, arg1],
} = pos.func.dfg[inst]
{
// Note that for Wasm, the bounding of the shift index has happened during translation
let arg0_type = pos.func.dfg.value_type(arg0);
let arg1_type = pos.func.dfg.value_type(arg1);
assert!(!arg1_type.is_vector() && arg1_type.is_int());

// TODO it may be more clear to use scalar_to_vector here; the current issue is that
// scalar_to_vector has the restriction that the vector produced has a matching lane size
// (e.g. i32 -> i32x4) whereas bitcast allows any-to-any conversions (e.g. i32 ->
// i64x2). This matters because for some reason x86_psrl only allows i64x2 as the shift
// index type--this could be relaxed since it is not really meaningful.
let shift_index = pos.ins().bitcast(I64X2, arg1);

if arg0_type == I8X16 {
// First, shift the vector as eight 16-bit lanes: x86 has no packed 8-bit logical shift, so
// bits from the high byte of each 16-bit lane leak into its low byte and are masked off below.
let bitcasted = pos.ins().raw_bitcast(I16X8, arg0);
let shifted = pos.ins().x86_psrl(bitcasted, shift_index);
let shifted = pos.ins().raw_bitcast(I8X16, shifted);

// Masking uses the 128-byte USHR_MASKS table above: row `shift` holds `0xff >> shift` in
// every byte, clearing the bits shifted in from the neighboring byte of each 16-bit lane.
// The table is a substantial code-size increase but avoids a longer instruction sequence
// (see WebAssembly/simd#117).
let masks = pos.func.dfg.constants.insert(USHR_MASKS.as_ref().into());
let mask_address = pos.ins().const_addr(isa.pointer_type(), masks);
let mask_offset = pos.ins().ishl_imm(arg1, 4);
let mask =
pos.ins()
.load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0);
pos.func.dfg.replace(inst).band(shifted, mask);
} else if arg0_type.is_vector() {
// x86 has encodings for these shifts.
pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
} else {
unreachable!()
}
}
}

fn expand_tls_value(
inst: ir::Inst,
func: &mut ir::Function,
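A side note on the 128-byte USHR_MASKS table added above (a hedged sketch; the helper name is hypothetical and not part of the commit): each 16-byte row n holds 0xff >> n in every byte, which is why the legalization computes the mask offset as ishl_imm(arg1, 4), i.e. shift * 16.

// Hypothetical reconstruction of USHR_MASKS, shown only to make its layout explicit.
fn build_ushr_masks() -> [u8; 128] {
    let mut table = [0u8; 128];
    for shift in 0..8 {
        for byte in 0..16 {
            // Row `shift` clears the top `shift` bits of every byte.
            table[shift * 16 + byte] = 0xff >> shift;
        }
    }
    table
}

fn main() {
    let table = build_ushr_masks();
    assert!(table[..16].iter().all(|&b| b == 0xff)); // shift by 0 keeps every bit
    assert!(table[112..].iter().all(|&b| b == 0x01)); // shift by 7 keeps only the lowest bit
}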
28 changes: 28 additions & 0 deletions cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
@@ -2,6 +2,34 @@ test legalizer
set enable_simd
target x86_64 skylake

function %ushr_i8x16() -> i8x16 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0
; check: v3 = bitcast.i64x2 v0
; nextln: v4 = raw_bitcast.i16x8 v1
; nextln: v5 = x86_psrl v4, v3
; nextln: v6 = raw_bitcast.i8x16 v5
; nextln: v7 = const_addr.i64 const1
; nextln: v8 = ishl_imm v0, 4
; nextln: v9 = load_complex.i8x16 v7+v8
; nextln: v2 = band v6, v9
return v2
}

function %blah() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0

v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}

function %ishl_i32x4() -> i32x4 {
block0:
v0 = iconst.i32 1
13 changes: 13 additions & 0 deletions cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
@@ -38,6 +38,19 @@ block0:
}
; run

function %ushr_i8x16() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0

v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run

function %ushr_i64x2() -> b1 {
block0:
v0 = iconst.i32 1
