Add x86 implementation of 8x16 ushr
This involves some large mask tables that may hurt code size but reduce the number of instructions. See WebAssembly/simd#117 for a more in-depth discussion on this.
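As a quick orientation for readers of the diff below, here is a minimal scalar sketch of the technique (a hedged illustration only; the function name and layout are mine, not part of the commit): x86 has no packed 8-bit logical right shift, so each pair of bytes is shifted as a single 16-bit lane and the bits that leak in from the neighboring byte are cleared with a per-shift-amount mask of 0xff >> shift in every byte.

// Hypothetical helper, for illustration only: emulate an i8x16 logical shift
// right using 16-bit shifts plus a per-shift-amount byte mask.
fn ushr_i8x16_via_i16x8(lanes: [u8; 16], shift: u32) -> [u8; 16] {
    // Mirror the masking of the shift amount that the Wasm translator performs.
    let shift = shift & 7;
    // One row of the 128-byte mask table: 0xff >> shift in every byte.
    let mask = 0xffu8 >> shift;
    let mut out = [0u8; 16];
    for i in (0..16).step_by(2) {
        // Little-endian: byte i is the low byte of the 16-bit lane, byte i + 1 the high byte.
        let lane = u16::from_le_bytes([lanes[i], lanes[i + 1]]) >> shift;
        let [lo, hi] = lane.to_le_bytes();
        out[i] = lo & mask; // clear bits shifted in from the high byte
        out[i + 1] = hi & mask; // already correct; the mask is a no-op here
    }
    out
}

fn main() {
    let v: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    // Matches the expectation in the new simd-bitwise-run.clif test below.
    assert_eq!(ushr_i8x16_via_i16x8(v, 1), [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]);
}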
abrown committed Mar 21, 2020
1 parent 2f648ea commit aba1ef4
Showing 4 changed files with 113 additions and 10 deletions.
18 changes: 9 additions & 9 deletions cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -357,7 +357,6 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
let x86_pshufd = x86_instructions.by_name("x86_pshufd");
let x86_psll = x86_instructions.by_name("x86_psll");
let x86_psra = x86_instructions.by_name("x86_psra");
let x86_psrl = x86_instructions.by_name("x86_psrl");
let x86_ptest = x86_instructions.by_name("x86_ptest");

let imm = &shared.imm;
@@ -497,14 +496,14 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
}

// SIMD shift right (logical)
for ty in &[I16, I32, I64] {
let ushr = ushr.bind(vector(*ty, sse_vector_size));
let bitcast = bitcast.bind(vector(I64, sse_vector_size));
narrow.legalize(
def!(a = ushr(x, y)),
vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
);
}
// for ty in &[I16, I32, I64] {
// let ushr = ushr.bind(vector(*ty, sse_vector_size));
// let bitcast = bitcast.bind(vector(I64, sse_vector_size));
// narrow.legalize(
// def!(a = ushr(x, y)),
// vec![def!(b = bitcast(y)), def!(a = x86_psrl(x, b))],
// );
// }

// SIMD shift left (arithmetic)
for ty in &[I16, I32, I64] {
Expand Down Expand Up @@ -695,6 +694,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
narrow.custom_legalize(extractlane, "convert_extractlane");
narrow.custom_legalize(insertlane, "convert_insertlane");
narrow.custom_legalize(ineg, "convert_ineg");
narrow.custom_legalize(ushr, "convert_ushr");

narrow.build_and_add_to(&mut shared.transform_groups);
}
64 changes: 63 additions & 1 deletion cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -6,7 +6,7 @@ use crate::cursor::{Cursor, FuncCursor};
use crate::flowgraph::ControlFlowGraph;
use crate::ir::condcodes::{FloatCC, IntCC};
use crate::ir::types::*;
use crate::ir::{self, Function, Inst, InstBuilder};
use crate::ir::{self, Function, Inst, InstBuilder, MemFlags};
use crate::isa::constraints::*;
use crate::isa::enc_tables::*;
use crate::isa::encoding::base_size;
@@ -1289,6 +1289,68 @@ fn convert_ineg(
}
}

static USHR_MASKS: [u8; 128] = [
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];

// TODO this would be unnecessary if we could pass varargs in legalize.rs
// For SIMD logical shift right, convert `ushr` to `x86_psrl`; the 8x16 case has no matching
// x86 instruction, so it is emulated with a 16x8 shift plus a mask loaded from USHR_MASKS.
fn convert_ushr(
inst: ir::Inst,
func: &mut ir::Function,
_cfg: &mut ControlFlowGraph,
isa: &dyn TargetIsa,
) {
let mut pos = FuncCursor::new(func).at_inst(inst);
pos.use_srcloc(inst);

if let ir::InstructionData::Binary {
opcode: ir::Opcode::Ushr,
args: [arg0, arg1],
} = pos.func.dfg[inst]
{
// Note that for Wasm, the bounding of the shift index has happened during translation
let arg0_type = pos.func.dfg.value_type(arg0);
let arg1_type = pos.func.dfg.value_type(arg1);
assert!(!arg1_type.is_vector() && arg1_type.is_int());

// TODO it may be more clear to use scalar_to_vector here; the current issue is that
// scalar_to_vector has the restriction that the vector produced has a matching lane size
// (e.g. i32 -> i32x4) whereas bitcast allows any-to-any conversions (e.g. i32 ->
// i64x2). This matters because for some reason x86_psrl only allows i64x2 as the shift
// index type--this could be relaxed since it is not really meaningful.
let shift_index = pos.ins().bitcast(I64X2, arg1);

if arg0_type == I8X16 {
// First, shift the vector as eight 16-bit lanes: x86 has no packed 8-bit logical shift, so
// bits from the high byte of each 16-bit lane leak into its low byte and are masked off below.
let bitcasted = pos.ins().raw_bitcast(I16X8, arg0);
let shifted = pos.ins().x86_psrl(bitcasted, shift_index);
let shifted = pos.ins().raw_bitcast(I8X16, shifted);

// Masking uses the 128-byte USHR_MASKS table above: row `shift` holds `0xff >> shift` in
// every byte, clearing the bits shifted in from the neighboring byte of each 16-bit lane.
// The table is a substantial code-size increase but avoids a longer instruction sequence
// (see WebAssembly/simd#117).
let masks = pos.func.dfg.constants.insert(USHR_MASKS.as_ref().into());
let mask_address = pos.ins().const_addr(isa.pointer_type(), masks);
let mask_offset = pos.ins().ishl_imm(arg1, 4);
let mask =
pos.ins()
.load_complex(arg0_type, MemFlags::new(), &[mask_address, mask_offset], 0);
pos.func.dfg.replace(inst).band(shifted, mask);
} else if arg0_type.is_vector() {
// x86 has encodings for these shifts.
pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
} else {
unreachable!()
}
}
}

fn expand_tls_value(
inst: ir::Inst,
func: &mut ir::Function,
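A side note on the 128-byte USHR_MASKS table added above (a hedged sketch; the helper name is hypothetical and not part of the commit): each 16-byte row n holds 0xff >> n in every byte, which is why the legalization computes the mask offset as ishl_imm(arg1, 4), i.e. shift * 16.

// Hypothetical reconstruction of USHR_MASKS, shown only to make its layout explicit.
fn build_ushr_masks() -> [u8; 128] {
    let mut table = [0u8; 128];
    for shift in 0..8 {
        for byte in 0..16 {
            // Row `shift` clears the top `shift` bits of every byte.
            table[shift * 16 + byte] = 0xff >> shift;
        }
    }
    table
}

fn main() {
    let table = build_ushr_masks();
    assert!(table[..16].iter().all(|&b| b == 0xff)); // shift by 0 keeps every bit
    assert!(table[112..].iter().all(|&b| b == 0x01)); // shift by 7 keeps only the lowest bit
}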
28 changes: 28 additions & 0 deletions cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
@@ -2,6 +2,34 @@ test legalizer
set enable_simd
target x86_64 skylake

function %ushr_i8x16() -> i8x16 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0
; check: v3 = bitcast.i64x2 v0
; nextln: v4 = raw_bitcast.i16x8 v1
; nextln: v5 = x86_psrl v4, v3
; nextln: v6 = raw_bitcast.i8x16 v5
; nextln: v7 = const_addr.i64 const1
; nextln: v8 = ishl_imm v0, 4
; nextln: v9 = load_complex.i8x16 v7+v8
; nextln: v2 = band v6, v9
return v2
}

function %blah() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0

v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}

function %ishl_i32x4() -> i32x4 {
block0:
v0 = iconst.i32 1
13 changes: 13 additions & 0 deletions cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
@@ -38,6 +38,19 @@ block0:
}
; run

function %ushr_i8x16() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0

v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run

function %ushr_i64x2() -> b1 {
block0:
v0 = iconst.i32 1
